Compare commits


221 Commits

Author SHA1 Message Date
4a0d6ac40b updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-09-24 14:41:00 -04:00
54e42b72db Support mnnvl all2allv from Flashinfer (#21003)
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-09-24 14:38:16 -04:00
2dda3e35d0 [Bugfix] add cache model when from object storage get model (#24764)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-09-24 18:11:16 +00:00
d83f3f7cb3 Fixes and updates to bench_per_token_quant_fp8 (#25591)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-24 08:30:15 -07:00
302eb941f3 [ROCm][Build][Bugfix] Fix ROCm base docker whls installation order (#25415)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-24 11:25:10 -04:00
487745ff49 [ROCm][Bugfix] Only enable +rms_norm based on aiter if not explicitly disabled (#25275)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-24 11:24:39 -04:00
9313be5017 [Misc] Improve type annotations for jsontree (#25577)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-24 22:49:58 +08:00
8938774c79 Move DeviceConfig, ObservabilityConfig, SpeechToTextConfig to their own files (#25564)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-24 13:59:05 +00:00
e18b714b2e [Bugfix] Fix DeepSeekV31ToolParser to correctly parse multiple tools in non-streaming output (#25405)
Signed-off-by: taohui <taohui3@gmail.com>
2025-09-24 20:58:00 +08:00
b1068903fd [docs] fix nixl kv_connector_extra_config.backends key (#25565)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-24 11:00:27 +00:00
164299500b [Benchmark] Fix regression in structured output benchmark (#25500)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-24 10:40:42 +00:00
58c360d9be [Bug] fix import and unit test (#25558)
Signed-off-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
2025-09-24 10:17:59 +00:00
42488dae69 [Bugfix] Fix dummy video number of frames calculation (#25553)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-24 09:47:30 +00:00
b67dece2d8 [misc] update the warning message (#25566)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-24 17:24:35 +08:00
2338daffd3 [BugFix] Potential Fix for FA3 full-cudagraph IMA (#25490)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-24 02:04:04 -07:00
2e19a848d4 [V0 Deprecation] Remove max_seq_len_to_capture (#25543)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-24 01:51:39 -07:00
77a7fce1bb [CI/Build] add nightly prime-rl integration tests (#25207)
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-24 08:44:22 +00:00
6488f3481b [Misc]] Move processing context to multimodal directory (#25548)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-24 08:15:00 +00:00
27ec3c78f3 [CI/Build] Fix v1 OOT registration test (#25547)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-24 08:03:13 +00:00
1cbcfb94de [Bugfix][CPU] Skip unsupported custom op register on CPU (#25534)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-24 06:21:51 +00:00
fed8a9b107 [Misc] Retry HF processing if "Already borrowed" error occurs (#25535)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-23 22:32:11 -07:00
190c45a6af [TPU][Bugfix] fix the missing apply_model in tpu worker (#25526)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-09-24 05:18:08 +00:00
5caaeb714c [Bugfix] [Frontend] Cleanup gpt-oss non-streaming chat tool calls (#25514)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-09-24 03:20:38 +00:00
d747c2ef18 [Perf] Fix jit compiles at runtime of fla gated delta rule (#25432)
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-24 11:16:13 +08:00
c30b405b8f [Spec Decode] Enable FlashInfer Spec Decoding (#25196)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: lhsjohn <huashuoli@tencent.com>
2025-09-23 22:29:58 -04:00
77d906995c [KV sharing] Re-land Gemma3n model changes from #22628 (#24357)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-09-23 19:25:34 -07:00
359d293006 [fix]: add Arm 4bit fused moe support (#23809)
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
2025-09-24 01:32:22 +00:00
9df8da548e [BugFix] Fix MLA assert with CUTLASS MLA (#25478)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-23 21:09:43 -04:00
bf68fd76a9 [Compile] Fix AMD Compile Error (#25518)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-24 00:42:48 +00:00
de94289a98 [Core] Support weight_loader_v2 for UnquantizedLinearMethod (#23036)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-09-23 18:30:26 -06:00
1983609239 [Bugfix] Use a separate FlashInfer workspace buffer for trtllm-gen (#25520) 2025-09-24 00:19:56 +00:00
d06b5a95cb [V1][Metrics] Add per-request TPOT histogram (#24015)
Signed-off-by: baxingpiaochong <771405853@qq.com>
2025-09-23 18:19:04 -06:00
be0bb568c9 [Model] Support SeedOss Reason Parser (#24263)
Signed-off-by: Yan Lu <luyan@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 18:15:51 -06:00
c8bde93367 [BUG] Allows for RunAI Streamer and Torch.compile cache to be used together (#24922)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-09-23 18:13:32 -06:00
88d7bdbd23 [Bug] Fix AttributeError: 'FusedMoE' object has no attribute 'w13_weight_scale'. Did you mean: 'w13_weight_scale_inv' (#25519)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-24 00:07:51 +00:00
0d235b874a Add CUTLASS FP8 MOE benchmark scripts and kernel config (#25302)
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
2025-09-23 18:07:42 -06:00
7ad5e50adf Improve output when failing json.loads() on structured output test (#25483)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-09-23 18:03:31 -06:00
dc464a3d39 [BugFix] AssertionError: Do not capture num_reqs > max_num_reqs for uniform batch (#25505)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-23 18:00:29 -06:00
1210e4d95b [Bugfix] [B200] cutlass_mla - ensure kv_split == 1 for batch size > 1 (#25509)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-09-23 16:57:55 -07:00
e0b24ea030 [Perf] Increase default max splits for FA3 full cudagraphs (#25495)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-23 16:53:34 -07:00
bde2a1a8a4 [ROCm] Small functional changes for gptoss (#25201)
Signed-off-by: jpvillam <jpvillam@amd.com>
Co-authored-by: jpvillam <jpvillam@amd.com>
2025-09-23 23:39:50 +00:00
5e25b12236 [Kernel] [Mamba] Remove BLOCK_H=1 from list of tuneable configurations for _chunk_cumsum_fwd_kernel (#25197)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
2025-09-23 23:23:30 +00:00
c85d75cf08 Add VLLM_NVTX_SCOPES_FOR_PROFILING=1 to enable nvtx.annotate scopes (#25501)
Signed-off-by: Corey Lowman <clowman1993@gmail.com>
2025-09-23 22:50:09 +00:00
abad204be6 [BugFix] Fix OOM in vLLM replicas by ensuring consistent NCCL memory accounting (#25359)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-09-23 15:49:09 -07:00
7361ab379f Remove redundant mutates_args and dispatch_key for direct_register_custom_op (#25512)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 22:48:40 +00:00
95bc60e4cb [gpt-oss][bugfix] remove logic to require resp_ in ResponseAPI (#25428)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-23 15:46:46 -07:00
4f2954f724 Fix triton_reshape_and_cache_flash.py triton import (#25522)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 15:26:10 -07:00
eca7be9077 Add VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE & VLLM_ENABLE_INDUCTOR_COORDINA… (#25493)
Signed-off-by: rouchenzi <ruochenwen@gmail.com>
Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com>
2025-09-23 22:17:49 +00:00
969b4da3a6 [V0 Deprecation] Remove placeholder attn (#25510)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-09-23 22:12:14 +00:00
4f8c4b890a [Core] Use KVCacheBlock as much as possible instead of dict[block_id, KVCacheBlock] (#24830)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-09-23 15:11:14 -07:00
ae002924e9 [CI/Build] Fix and re-enable v1 PP test on CI (#25496)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 21:58:25 +00:00
690f948e4a [Bugfix] Fix for the import error from #24588 (#25481)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-23 21:31:08 +00:00
08275ec0a2 [Build] Update Xgrammar to 0.1.25 (#25467)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-23 21:25:46 +00:00
c828d1bf98 [Bugfix] gpt-oss container tool output bug (#25485)
Signed-off-by: Alec Solder <alecs@fb.com>
Co-authored-by: Alec Solder <alecs@fb.com>
2025-09-23 20:43:45 +00:00
8b8a8afc89 [CI] Fix Pre-commit Issue (#25497)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-24 04:09:37 +08:00
8bdd8b5c51 Enable symmetric memory all reduce by default only enabling for TP (#25070)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 15:53:00 -04:00
a8ffc4f0f2 [Bugfix] Lower gpt-oss max cudagraph size to 992 to be compatible with FA3 (#25508)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 12:49:55 -07:00
d5944d5146 [Speculators][Speculative Decoding] Fix gpt-oss eagle3 accuracy issue (#25406)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
2025-09-23 15:44:35 -04:00
24fab45d96 [Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 15:29:26 -04:00
63400259d0 [Performance] Move apply_w8a8_block_fp8_linear to an op class (#24666)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: ElizaWszola <elizaw.9289@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
2025-09-23 12:03:10 -07:00
8c1c81a3de [core] add nccl symmetric memory for all reduce (#24532)
Signed-off-by: Amir Samani <asamani@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 14:33:06 -04:00
a3a7828010 [ROCm] Add skinny gemm bias support for dtypes fp16,bf16,fp8 (#24988)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com>
2025-09-23 14:31:45 -04:00
5abb117901 [Core] Ensure LoRA linear respect the base_layer's tp_size and tp_rank (#25487)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-23 18:19:25 +00:00
867ecdd1c8 [Spec Decode][CI] Add e2e test for examples/spec_decode.py and prevent breaking Acceptance Length (#24531)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-23 10:46:40 -07:00
24e8222745 [Misc] Reduce initialization time of auto_tune (#23682)
Signed-off-by: Weida Hong <wdhongtw@google.com>
2025-09-23 17:34:58 +00:00
100b630a60 [V1][Kernel] Add triton implementation for reshape_and_cache_flash (#24503)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-23 12:52:40 -04:00
527821d191 Use macro guard CUDA functions for back compatibility in grouped_topk_kernel.cu (#25346)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-23 09:45:39 -07:00
846197f505 [Log] Optimize kv cache memory log from Bytes to GiB (#25204)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-23 12:44:37 -04:00
2357480b1a [BugFix] Fix UB in per_token_group_quant.cu (#24913)
Signed-off-by: Shreeasish Kumar <shreeasish@rivosinc.com>
2025-09-23 09:14:22 -07:00
f11e3c516b [Kernels] Support blocked fp8 quantization for compressed tensors MoE (#25219)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 16:11:34 +00:00
875d6def90 Add backward compatibility for GuidedDecodingParams (#25422)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-23 17:07:30 +01:00
cc1dc7ed6d [Core/DBO][2/N] Dual-Batch Overlap add DeepEP High Throughput support and Prefill support (#24845)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-23 16:02:10 +00:00
a903669e10 [V1] Remove V0 code paths for Hybrid models (#25400)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-09-23 08:26:13 -07:00
2c58742dff [UX] Change kv-cache-memory log level to debug (#25479)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 08:01:24 -07:00
4c966e440e [XPU] Fix MOE DP accuracy issue on XPU (#25465) 2025-09-23 14:32:57 +00:00
da5e7e4329 [Docs] NixlConnector quickstart guide (#24249)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Signed-off-by: Nicolò Lucchesi<nicolo.lucchesi@gmail.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-09-23 14:23:22 +00:00
f05a4f0e34 [P/D] Support NIXL connector to disconnect during a clean shutdown (#24423)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
2025-09-23 16:08:02 +02:00
61d1b35561 [BugFix] Register expert_map as named buffer for wake_up and sleep (#25458)
Signed-off-by: wuxibin <wuxibin@bytedance.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-09-23 21:49:13 +08:00
b6a136b58c [CI/Build] Fix disabled v1 attention backend selection test (#25471)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 13:05:46 +00:00
0d9fe260dd [docs] Benchmark Serving Incorrect Arg (#25474)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-09-23 06:05:11 -07:00
273690a50a [Core] Optimize LoRA weight loading (#25403)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-23 18:19:45 +08:00
231c2c63e4 [Bugfix] Fix idefics3 tie_word_embeddings (#25454)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 10:06:48 +00:00
4322c553a6 [Test]: Hermes tool parser stream output error in Qwen3 case (#25203)
Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com>
2025-09-23 17:56:31 +08:00
babad6e5dd [Misc] Move DP for ViT code inside model executor dir (#25459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-23 09:20:52 +00:00
9383cd6f10 [Frontend] Add a new xml-based tool parser for qwen3-coder (#25028)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-09-23 16:07:27 +08:00
ba8d2165b6 Handle triton kernel import exception (#25319)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-09-23 00:56:00 -07:00
c98be0a232 [Model] Enable DP for ViT in Qwen2-VL (#25445)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-23 05:17:10 +00:00
5774b0a1da [NIXL][OOT platform] support nixl_connector with oot platform and other nixl_backend (#25121)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-09-23 04:17:42 +00:00
e8db44f883 [DP/EP][GPTOSS] Use triton matmul-ogs kernels for GPTOSS DP/EP (#24588)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-09-22 21:01:09 -07:00
fafbe11af4 [Docs] Fix griffe warnings in vllm/lora/ops (#25369)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-23 03:42:58 +00:00
78237e43bf [Bugfix] Remove contiguous output req for context parallel MLA (#25414)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-22 20:26:32 -07:00
eea1783989 [benchmarks]allow skip ready check for bench serve (#25420)
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-09-23 03:21:48 +00:00
f225ea7dd9 [XPU] Fix compile_size is None case. (#25433)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-23 03:09:00 +00:00
fc97733da8 [feat] Support MRoPE + YaRN (#25384)
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
2025-09-23 03:04:47 +00:00
4741239db7 [Bug] Fix Long Context OOM Issue (#25290)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-22 22:04:15 -04:00
c625f9043c [V0 deprecation] Remove _set_default_args_v0 function (#25409)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 01:52:09 +00:00
6fa78d8f23 [V0 deprecation] Remove platform v1 controling interface (#25410)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 01:48:12 +00:00
9949aa2ef1 [Perf] Apply torch.compile for per_block_cast_to_fp8 (#24611)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-22 19:42:45 -06:00
0b7bed9c38 [Performance] Remove input pads in cutlass_mla and optimize v_proj output handling (#25184)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-09-22 19:20:53 -06:00
ac0048c0ae [BugFix] [DP/EP] Fix slow execution when BS <= DP (#25407)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Chris Bamford <chrisbam4d@gmail.com>
2025-09-22 17:26:17 -07:00
090197034f [Bugfix] Fix missing clear_connector_metadata (#25397)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-23 08:10:59 +08:00
f31ff87460 [Core] Drop overly aggressive whisper assertion (#25408)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-22 17:09:52 -07:00
d588cd2406 [Bugfix] fix custom op test (#25429)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-09-23 00:07:43 +00:00
45d7d852d3 [Frontend] Responses API MCP tools for built in tools and to pass through headers (#24628)
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-22 23:38:19 +00:00
8bed179109 [TPU] update torch_xla dependency for PyPI compatibility (#25278)
Signed-off-by: Johnny Yang <johnnyyang@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-09-22 16:14:44 -07:00
f552d5e578 [CI/Build] Skip Qwen3-VL initialization tests until models are actually released (#25394)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 13:18:24 -07:00
8db2939289 [KV offload][5/N] Add CPUOffloadingSpec (#24251)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-22 12:30:36 -07:00
d5e0fca264 [torch.compile] Cleanup compilation tests and custom passes, add debug utils, fix DCE bug (#23091), fix test (#24376), and prep for custom op matching (#24604) (#24542)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: luka <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-22 12:30:05 -07:00
8d0ee5a564 [misc] Remove RFC review hours reference (#25416) 2025-09-22 12:16:59 -07:00
922979bfcc [DP] support torchrun external launcher with Data Parallelism (#24899)
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-22 12:06:05 -07:00
239ef0c1ac [CI Failure] Fix fp8 kv cache on <SM90 (#25396)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-22 18:27:51 +00:00
1d7f95b85c [Compiler] Disable Inductor standalone compile by default (#25391)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
2025-09-22 17:37:46 +00:00
cfbee3d0e7 [CLI env var] Add VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH in env variables (#25274)
Signed-off-by: qqma <qqma@amazon.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: qqma <qqma@amazon.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-22 10:37:43 -07:00
06a41334c7 [EPLB] Reduce EPLB Inference Overhead (#24573)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-22 16:31:05 +00:00
175811e3b5 [V1][Attention] Split triton_attn in triton-only and rocm specific backends (#24648)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
2025-09-22 15:20:28 +00:00
c10101a3eb [Bugfix] Fix several issues with p2p xPyD in GET type (#23993)
Signed-off-by: Csrayz <jover@cmbchina.com>
Signed-off-by: ivyilike <pww123@cmbchina.com>
Co-authored-by: ivyilike <pww123@cmbchina.com>
2025-09-22 14:53:13 +00:00
ac243886b0 [Kernel] MI-300X triton moe configs (#23445)
Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com>
2025-09-22 14:29:54 +00:00
3d2c56b7a9 Make mypy behave like a proper pre-commit hook (#25313)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-22 12:23:45 +00:00
64c824cd78 Make pickle import check fast (#25379)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-22 04:08:25 -07:00
417a164af6 [Misc] Remove unused encoder-decoder error strings (#25374)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 11:04:32 +00:00
b6f01bd9a7 refactor: abstract graph mode support into platform interface (#25161)
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-22 10:22:29 +00:00
4cf71cc88a [TPU] Deprecate xm.mark_step in favor of `torch_xla.sync (#25254)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-22 10:12:57 +00:00
a66d131381 [TPU][Bugfix][CI] Fix broken tests/build dependency (#25255)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-22 09:55:04 +00:00
21467f9a1c Enable Eagle3 speculative decoding for GPT-OSS model (#25246)
Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
2025-09-22 08:50:39 +00:00
f92d952632 [V0 Deprecation] Remove MultiModalPlaceholderMap (#25366)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 08:49:19 +00:00
6d0b827cbd [V0 Deprecation] Remove V0-only methods in multi-modal registry (#25362)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 13:58:26 +08:00
0eecb31663 [Bugfix] Fix hermes tool parser handling of non-string argument types (#22002)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-09-22 11:35:39 +08:00
793be8d057 [Docs] GSM8K Accuracy Evaluation doc update (#25360)
Signed-off-by: David Chen <530634352@qq.com>
2025-09-22 02:49:13 +00:00
7b57a433da [Model] Support Dots OCR (#24645)
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: yinz-aizip <yinz@aizip.ai>
2025-09-22 02:24:40 +00:00
5aeb925452 Multimodal - audio tests (#25285)
Signed-off-by: Debolina Roy <debroy@redhat.com>
2025-09-22 07:07:11 +08:00
04d3752329 [Bugfix][V0 Deprecation][CI] use async mock and await for async method (#25325)
Signed-off-by: Yang <lymailforjob@gmail.com>
2025-09-22 07:06:16 +08:00
bc6e542d9f Remove V0 attention backends (#25351)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-21 16:03:28 -07:00
af7dfb0d1a [Perf] Further optimization for Qwen3-VL fast_pos_embed_interpolate (#25347)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-21 20:12:45 +00:00
1c3ffdbecc [V0 Deprecation] Remove V0 sampling metadata (#25345)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-21 10:37:11 -07:00
c438b2951c feat: Enable engine-level arguments with speculators models (#25250)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
2025-09-21 11:04:45 -06:00
0ff8ebb2d7 [V0 Deprecation] Remove async_output_proc, preemption mode, delay factor (#25334)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-21 08:52:32 -07:00
26e673fe93 [V0 Deprecation] Remove V0 Sequence class & Sampler (#25332)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-21 08:52:15 -07:00
65a5910ce3 [Optimization] Cache chat template result when processor fails to be loaded (#25341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-21 19:41:02 +08:00
9aea7373ff [Bugfix] Typos in error message for missing model config file (#25339)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
2025-09-21 04:36:47 -07:00
30d08911f7 [MM][Perf] Minor Optimization on Qwen3-VL fast_pos_embed_interpolate (#25337)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-21 11:05:20 +00:00
cf56cf78b4 [V1] Add sliding window support to Flex Attention backend (#24089)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-21 05:08:07 +00:00
7ed82d1974 [V0 Deprecation] Remove V0 MP executor (#25329)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 21:26:35 -07:00
12dbd834cf [V0 Deprecation] Remove from_seq_group methods (#25330)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 21:10:48 -07:00
035fd2bd2c [Multi Modal][Performance] Fused Q,K's apply_rope in more models (#25005)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-21 03:55:10 +00:00
1cd885bd54 [V0 Deprecation] Remove V0 model runner base & simplify worker base (#25328)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 20:49:09 -07:00
62b38dc832 [Doc] improve test-pipeline.yaml documentation (#25305)
Signed-off-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-09-20 20:29:12 -07:00
c99db8c8dd [V0 Deprecation] Remove V0 core (#25321)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 19:58:26 -07:00
72dd1595b4 [CI] Skip tests failing on main (#25326)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 19:57:46 -07:00
572ddf83ce [Chore] Remove unused sampler in models (#25324)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 19:53:20 -07:00
86647d1cd0 [V0 Deprecation] Remove V0 Output Processor (#25320)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 17:57:20 -07:00
52c2a8d4ad [V0 Deprecation] Remove LLMEngine (#25033)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 17:56:30 -07:00
367a480bd3 [Docs] Fix warnings in vllm/profiler and vllm/transformers_utils (#25220)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-20 16:39:47 -07:00
bef180f009 [V0 Deprecation] Enable the remaining multimodal tests in V1 (#25307)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 17:50:58 +00:00
d88918e4c2 [Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308)
Signed-off-by: pengdrumli <pengdrumli@tencent.com>
2025-09-20 21:15:22 +08:00
3c713a9711 [Model] Cleanup InternViT's data parallel implementation (#25306)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-20 05:46:24 -07:00
bf8b26cad1 Generate _ModelInfo properties file when loading to improve loading speed (#23558)
Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-20 11:51:13 +00:00
032d661d27 [Docs] Fix warnings in mkdocs build (continued) (#25042)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-09-20 11:45:18 +00:00
e08a3a3fdb [CI Failure] Disable FlashInfer RoPE to unblock CI (#25299)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-20 08:16:56 +00:00
3d9a1d2de5 [V1] Support LLM.apply_model (#18465)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 07:14:35 +00:00
be874c0201 [Bugfix] Fix Qwen3-VL-MoE weight loading for EP (#25300)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-20 00:04:05 -07:00
9607d5eb44 [Hybrid Allocator] Support full attention with different hidden size (#25101)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-19 23:43:59 -07:00
c60e6137f0 [Optimization] Avoid repeated model architecture conversion for pooling models (#25261)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 13:30:22 +08:00
f91480b2d4 [Bugfix] fix tool call arguments is empty (#25223)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: xin.li <xin.li@daocloud.io>
2025-09-20 13:29:54 +08:00
6c5f82e5aa [BUG FIX][NON-CUDA]quick fix to avoid call cudagraph_unsafe in attention (#25298)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-09-20 04:41:23 +00:00
b7f186bbb3 [BugFix] Exclude self when checking for port collision (#25286)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-20 12:28:31 +08:00
3642909617 [BUGFIX] GPTQ quantization compatibility for Qwen3 Next MOE models (AutoGPTQ and AutoRound-GPTQ) (#25268)
Signed-off-by: JartX <sagformas@epdcenter.es>
2025-09-20 11:18:13 +08:00
c308501cb6 Improve weight loading for encoder models in Transformers backend (#25289)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-20 03:11:03 +00:00
535d80056b [Misc] Support more collective_rpc return types (#25294)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-20 02:02:38 +00:00
a25ade5d47 [BugFix] Ensure appropriate guards in destructors (#25284)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-20 09:06:34 +08:00
8945b001db [torch.compile] CUDAGraph Inductor partition integration (#24281)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Signed-off-by: boyuanfeng <boyuan@meta.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-20 01:02:15 +00:00
b8a287a0a8 [docs] Prompt Embedding feature support (#25288)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-19 17:46:23 -07:00
c7e713616a test: Remove vestigial skip for prompt embeds tests after landing v1 Prompt Embeds support (#25291)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-19 17:33:40 -07:00
a36c675817 Don't skip special tokens with hermes-style tool calling (#25281)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-09-19 17:33:25 -07:00
3da17c2cc2 [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2025-09-19 20:27:21 -04:00
14c1432789 [BugFix] Fix async scheduling CPU tensor race take 2 (#25279)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-19 16:34:07 -07:00
ee7a66dd9a allow disable flashinfer prefill (#25276)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-09-19 22:59:41 +00:00
431535b522 Enable modelopt gemma3 nvfp4/fp8, make workflow more robust (#22771)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-19 22:40:33 +00:00
711e912946 [Compile] Fix Compile Warning for Ignoring MIN_BLOCK_PER_SM (#25193)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-19 16:23:19 -06:00
e69e0b8b5f [Frontend] Responses API messages out, just harmony for now (#24985)
Signed-off-by: Alec Solder <alecs@fb.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-19 21:40:16 +00:00
ddc9048394 Fix: Correct FusedMoE layer reference in auto_round quantization (#24818)
Signed-off-by: David-Wen <18927700430@163.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-19 20:44:24 +00:00
b1a63d1b3b [BugFix] Make FlashInferMetadataBuilder non-blocking (#25040)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-19 20:36:34 +00:00
48ecb4438b [Perf] Use FlashInfer RoPE for RotaryEmbedding.forward_cuda when available (#21126)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-19 14:06:49 -06:00
e57fc15971 Specify platform in pip-compile pre-commit hook so it runs on MacOS (#25273)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 12:43:33 -07:00
4bdf400218 [Bugfix] Fix chunked a2_scales in modular kernels (#25264)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-19 19:42:01 +00:00
7852b82b93 [Bugfix] GPT OSS Attritbute error on H100 (#25228)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-09-19 13:14:09 -06:00
a2a5f79e09 Optimize triton unified attention performance for sliding window attention (#24390)
Signed-off-by: zixi-qi <qizixi@meta.com>
2025-09-19 13:07:26 -06:00
c59a0eca42 [KV offload][4/N] Offloading KV connector (#22595)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-19 19:07:17 +00:00
b716ab93a7 [bugfix] fix structured outputs key missing issue from #24929 (#25195)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-09-19 18:37:57 +00:00
138f0d1e75 [Docs] add __init__.py to vllm/model_executor/layers/quantization/compressed_tensors/transform (#24974)
Signed-off-by: samzong <samzong.lu@gmail.com>
2025-09-19 18:32:27 +00:00
2506ce5189 [Core][Prefix Hash] Fix prefix hash metrics sliding window maintainance (#24990)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-09-19 12:22:53 -06:00
47fd08aaf9 [CI/Build] fix test function_calling (#25072)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-19 12:16:32 -06:00
12aed7e453 Encoder model support for the Transformers backend (#25174)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 19:15:22 +01:00
d90e212a3a Remove Redundant Assignment in Qwen3_VisionPatchMerger (#25224)
Signed-off-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-19 12:15:13 -06:00
2821986450 [Core] Modify the initialization parameters of the lora manager (#25249)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-19 18:01:28 +00:00
6c117cff7d [Frontend] Pass API server count to each process (#23717)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 01:15:19 +08:00
7ac67ea525 [KV offload][3/N] Add worker-side CPU support (#21448)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-19 09:53:45 -07:00
ce75e15373 refactor(benchmarks): add type annotations to wait_for_endpoint parameters (#25218)
Signed-off-by: samzong <samzong.lu@gmail.com>
2025-09-19 16:36:52 +00:00
aed16879a9 Move ModelConfig from config/__init__.py to config/model.py (#25252)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 16:22:33 +00:00
cf278ff3b2 Update CODEOWNERS (#25269)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 09:12:55 -07:00
838d7116ba [Qwen] Remove cuda hard-code in qwen3 next (#25243)
Signed-off-by: Icey <1790571317@qq.com>
2025-09-19 12:25:12 +00:00
5089fd749c [V0 Deprecation] Remove V0 logic from get_input_embeddings interface (#25242)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-19 11:10:52 +00:00
a3d087adec [P/D][Nixl] Introduce KVTransferMetrics and aggregation strategy (#22188)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-19 11:09:14 +00:00
058525b997 Move PoolerConfig from config/__init__.py to config/pooler.py (#25181)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 11:02:55 +00:00
1dfea5f4a9 [Bugfix][Perf] Misc fixes for Qwen3 VL (#25238)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-19 10:46:16 +00:00
cea91a32f2 [Kernel][Performance] Add Triton kernel for Qwen3-VL interleaved MRoPE (#25055)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-19 10:27:49 +00:00
a684c0124c [bugfix] fix MHA for models like OpenGVLab/InternVL3_5-38B (#25146)
Signed-off-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-19 08:45:06 +00:00
f2718d2948 [Misc] Cleanup test conftest for deprecated encoder-decoder models (#25231)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-19 07:44:56 +00:00
825fdb11ad [Bugfix][CPU] Add placeholder to avoid import errors when using fused_moe ops on platforms without triton (#25137)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-19 07:41:12 +00:00
8c1d4acbfe [CPU] Disable oneDNN linear on non-x86 platforms (#25166)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-19 07:27:22 +00:00
486c5599e3 [Build] Update Xgrammar to 0.1.24 to get a CVE fix (#25188)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-19 14:27:17 +08:00
a6149aa587 [OOT] Support sync_model_loading for OOT (#25126)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-09-19 05:41:53 +00:00
6c8a3c099b [Docs] Fix griffe warnings in vllm/multimodal (#25216)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-18 22:10:44 -07:00
31a8a2a7bc [Misc] Clean up MM profiling warnings (#25222)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-19 04:46:57 +00:00
1a0a04dae9 [Perf] Optimize memory peak during EAGLE model loading. (#24585)
Signed-off-by: Chen Ding <candy.dc@alibaba-inc.com>
2025-09-19 03:31:16 +00:00
6d8246aaff [gpt-oss] Add ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent for streaming (#24938)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-18 19:11:59 -07:00
9d1c50a5ac [KV offload][2/N] Introduce LRU-based CPU offloading management (#20075)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-19 00:20:51 +00:00
9a4600e4dc [CORE] Prompt Embeddings Support for v1 Engine (#24278)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Andrew Sansom <qthequartermasterman@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-09-19 08:03:09 +08:00
9fac6aa30b [BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-18 14:26:28 -07:00
a53ad626d6 [KV offload][1b/N] rename offloading to kv_offload (#25191)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-18 20:53:52 +00:00
1c3dad22ff [V0 Deprecation] Remove unused async_timeout.py (#25190)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-18 20:35:21 +00:00
d2a30a2d93 [Bug] Fix torch Compilation Cache Hit Error (#25093)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-18 12:38:37 -07:00
670 changed files with 23994 additions and 36637 deletions


@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi


@@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1


@@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1


@@ -0,0 +1,59 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"


@@ -6,24 +6,28 @@
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belong to an existing group, add it there
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
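To make the field documentation above concrete, here is a minimal sketch of a step entry using those fields; the label, dependency paths, and command are illustrative only and do not come from the actual pipeline:

- label: Example Correctness Test  # illustrative step name
  timeout_in_minutes: 20           # per-step timeout
  soft_fail: true                  # allow failure without failing the whole pipeline
  num_gpus: 2                      # override the default single-GPU allocation
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:        # opt-in prefixes; empty means always run
    - vllm/model_executor/
    - tests/models/
  commands:                        # incompatible with the single `command` field
    - pytest -v -s models/test_example.py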
@@ -110,7 +114,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
@@ -148,7 +152,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -161,12 +164,20 @@ steps:
- tests/v1/test_internal_lb_dp.py
- tests/v1/test_hybrid_lb_dp.py
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with tp=2 and pp=2
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -178,6 +189,7 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
@@ -280,7 +292,7 @@ steps:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/offloading
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
@@ -314,12 +326,13 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@@ -869,13 +882,13 @@ steps:
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
@@ -894,9 +907,10 @@ steps:
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
@@ -1030,3 +1044,16 @@ steps:
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh

.github/CODEOWNERS

@@ -4,11 +4,8 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
@@ -66,18 +63,26 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/models/test_transformers.py @hmellor
# Docs
/docs @hmellor
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor
# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
/tools/pre_commit @hmellor
# CPU
/vllm/v1/worker/^cpu @bigPYJ1151
/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
/vllm/v1/worker/^xpu @jikunshang
/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang


@@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:


@@ -49,7 +49,7 @@ repos:
rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -60,38 +60,32 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
entry: python tools/pre_commit/mypy.py 0 "local"
stages: [pre-commit] # Don't run in CI
<<: &mypy_common
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.9"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.10"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: tools/mypy.sh 1 "3.11"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.11"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: tools/mypy.sh 1 "3.12"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@ -155,11 +149,10 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
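
The mypy hooks above are deduplicated by anchoring the shared settings on the first hook's merge key (`<<: &mypy_common`) and merging them into the later hooks with `<<: *mypy_common`. A small Python sketch of how PyYAML resolves that pattern, abbreviated from the hunk above (hook ids and keys are trimmed for brevity):

```python
import yaml  # PyYAML resolves the YAML 1.1 merge key ("<<") at load time

doc = """
hooks:
  - id: mypy-local
    entry: python tools/pre_commit/mypy.py 0 "local"
    <<: &mypy_common
      language: python
      types_or: [python, pyi]
      require_serial: true
  - id: mypy-3.9
    entry: python tools/pre_commit/mypy.py 1 "3.9"
    <<: *mypy_common
"""

hooks = yaml.safe_load(doc)["hooks"]
# Both hooks carry the shared settings; only entry/stages differ per hook.
assert hooks[0]["require_serial"] is True
assert hooks[1]["types_or"] == ["python", "pyi"]
```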

View File

@ -103,10 +103,15 @@ start_server() {
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
# Check whether the server process is still alive; kill -0 works as a liveness
# probe since we always have permission to signal our own server process.
kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -118,7 +123,7 @@ start_server() {
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
return 1
else
return 0
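
The added `kill -0` guard lets the loop bail out as soon as the server process dies instead of polling the health endpoint for the full ten minutes. A rough Python equivalent of the same readiness check, for illustration only (URL, timeouts, and poll interval are assumptions):

```python
import subprocess
import time

import requests


def wait_for_server(proc: subprocess.Popen,
                    url: str = "http://0.0.0.0:8004/health",
                    timeout_s: int = 600, poll_s: int = 10) -> bool:
    """Poll a health endpoint, but stop early if the server process has died."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if proc.poll() is not None:  # process exited -> no point polling further
            return False
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not accepting connections yet
        time.sleep(poll_s)
    return False
```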

View File

@ -449,7 +449,8 @@ async def benchmark(
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
extra_body[request.structure_type] = request.schema
extra_body["structured_outputs"] = {}
extra_body["structured_outputs"][request.structure_type] = request.schema
return extra_body
print("Starting initial single prompt test run...")

View File

@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul,
w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(

View File

@ -51,7 +51,7 @@ def calculate_diff(
):
"""Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
@ -59,23 +59,25 @@ def calculate_diff(
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
out_allclose = lambda o1, o2: torch.allclose(
o1.to(torch.float32),
o2.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
if (
out_allclose(cuda_out, torch_out)
and scale_allclose(cuda_scale, torch_scale)
and out_allclose(cuda_out, torch_eager_out)
and scale_allclose(cuda_scale, torch_eager_scale)
):
try:
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_eager_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
print("✅ All implementations match")
else:
except AssertionError as e:
print("❌ Implementations differ")
print(e)
configs = []
@ -91,7 +93,7 @@ def benchmark_quantization(
):
device = torch.device("cuda")
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@ -157,21 +159,21 @@ if __name__ == "__main__":
)
parser.add_argument("-c", "--check", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
)
parser.add_argument(
"--hidden-sizes",
type=int,
nargs="+",
default=None,
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
default=[896, 1024, 2048, 4096, 7168],
help="Hidden sizes to benchmark",
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=None,
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
default=[1, 16, 128, 512, 1024],
help="Batch sizes to benchmark",
)
parser.add_argument(
"--group-sizes",
@ -192,8 +194,8 @@ if __name__ == "__main__":
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
hidden_sizes = args.hidden_sizes
batch_sizes = args.batch_sizes
if args.group_sizes is not None:
group_shapes = []
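
Switching from a boolean `torch.allclose` gate to `torch.testing.assert_close` means a mismatch now raises with per-element statistics (mismatch count, max absolute and relative error) instead of just printing ❌. A minimal sketch of the difference, with toy tensors:

```python
import torch

a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([1.0, 2.0, 3.5])

# Boolean gate: says only that the tensors differ.
print(torch.allclose(a, b, rtol=1e-3, atol=1e-5))  # False

# assert_close raises with a detailed report of where and by how much.
try:
    torch.testing.assert_close(a, b, rtol=1e-3, atol=1e-5)
except AssertionError as e:
    print(e)
```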

View File

@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""
import nvtx
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
"mixtral-8x7b": [
[8, 2, 4096, 14336],
],
"deepseek-v2": [
[160, 6, 5120, 12288],
],
"custom-small": [
[8, 2, 2048, 7168],
],
"glm45-fp8": [
[128, 8, 4096, 1408],
],
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
[128, 1, 5120, 8192],
],
}
DEFAULT_MODELS = [
"mixtral-8x7b",
]
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]
FP8_DTYPE = current_platform.fp8_dtype()
def bench_run(
results: list,
model: str,
num_experts: int,
topk: int,
per_act_token: bool,
per_out_ch: bool,
mkn: tuple[int, int, int],
):
(m, k, n) = mkn
dtype = torch.half
device = "cuda"
# Create input activations
a = torch.randn((m, k), device=device, dtype=dtype) / 10
# Create weights
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
# Create FP8 quantized weights and scales for both kernels
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
# Create scales based on quantization strategy
if per_out_ch:
# Per-channel quantization
w1_scale = torch.empty(
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
)
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
else:
# Per-tensor quantization
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
# Quantize weights
for expert in range(num_experts):
if per_out_ch:
# Per-channel quantization - not yet implemented properly
# For now, fall back to per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Expand scalar scales to the expected per-channel shape
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
w2_scale[expert] = w2_scale_temp.expand(k, 1)
else:
# Per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Store scalar scales in [1, 1] tensors
w1_scale[expert, 0, 0] = w1_scale_temp
w2_scale[expert, 0, 0] = w2_scale_temp
# Prepare weights for CUTLASS (no transpose needed)
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
# Create router scores and get topk
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
# Force per-tensor quantization for all cases to match working e2e setup
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
# Force per-tensor quantization for all cases
per_act_token = False
# Create stride tensors for CUTLASS
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
def run_cutlass_moe_fp8(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
cutlass_moe_fp8(
a=a,
w1_q=w1,
w2_q=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
# Pre-create quantization config to avoid creating it inside CUDA graph
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
cutlass_moe_fp8(
a=a,
w1_q=w1_fp8q_cutlass,
w2_q=w2_fp8q_cutlass,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
torch.cuda.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
fused_experts(
a,
w1_fp8q,
w2_fp8q,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
# Timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies = []
for _ in range(num_iters):
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
# Divide by 10 since graph contains 10 calls
return sum(latencies) / (num_iters * 10)
# Benchmark parameters
num_warmup = 5
num_iters = 100
# Benchmark only CUDA graphs (more reliable and faster)
# Benchmark Triton MoE with CUDA graphs
triton_graph_time = bench_cuda_graph(
triton_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Benchmark CUTLASS MoE with CUDA graphs
cutlass_graph_time = bench_cuda_graph(
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Convert ms to us and return results
triton_time_us = triton_graph_time * 1000
cutlass_time_us = cutlass_graph_time * 1000
return {
"batch_size": m,
"triton_time_us": triton_time_us,
"cutlass_time_us": cutlass_time_us,
}
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
all_results = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in args.per_act_token_opts:
for per_out_ch in args.per_out_ch_opts:
print(
f"\n=== {model}, experts={num_experts}, topk={topk},"
f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
)
config_results = []
for size_m in args.batch_sizes:
mkn = (size_m, size_k, size_n)
result = bench_run(
[], # Not used anymore
model,
num_experts,
topk,
per_act_token,
per_out_ch,
mkn,
)
if result:
config_results.append(result)
# Print results table for this configuration
if config_results:
print(
f"\n{'Batch Size':<12}"
f"{'Triton (us)':<15}"
f"{'CUTLASS (us)':<15}"
)
print("-" * 45)
for result in config_results:
print(
f"{result['batch_size']:<12}"
f"{result['triton_time_us']:<15.2f}"
f"{result['cutlass_time_us']:<15.2f}"
)
all_results.extend(config_results)
print(f"\nTotal benchmarks completed: {len(all_results)}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
across specified models/shapes/batches
Example usage:
python benchmark_cutlass_moe_fp8.py \
--models "Llama-4-Maverick-17B-128E-Instruct-FP8" \
--tp-sizes 8 \
--batch-sizes 2 4 8 \
--per-act-token-opts false \
--per-out-ch-opts false
"""
)
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
parser.add_argument(
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument(
"--per-act-token-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-activation token quantization options (true/false)",
)
parser.add_argument(
"--per-out-ch-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-output channel quantization options (true/false)",
)
args = parser.parse_args()
main(args)

View File

@ -7,6 +7,10 @@ Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot).
For NCCL symmetric memory, you need to set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1; otherwise
NCCL does not use the fast NVLS implementation for all-reduce.
Usage:
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
@ -26,7 +30,13 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator,
register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id,
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
@ -98,6 +108,7 @@ class CommunicatorBenchmark:
)
if not self.pynccl_comm.disabled:
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
register_nccl_symmetric_ops(self.pynccl_comm)
else:
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
self.pynccl_comm = None
@ -194,6 +205,15 @@ class CommunicatorBenchmark:
None, # no env variable needed
)
)
communicators.append(
(
"pynccl-symm",
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_multimem is not None:
comm = self.symm_mem_comm_multimem
@ -271,7 +291,9 @@ class CommunicatorBenchmark:
# Capture the graph using context manager
with context:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
with torch.cuda.graph(graph, pool=graph_pool):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
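
Per the docstring added near the top of this file, the NVLS-backed symmetric-memory all-reduce is only exercised when those variables are set before the process group is created. A hedged sketch of setting them from a launcher script (the variable values come from the docstring; the launch command and process count are illustrative):

```python
import os
import subprocess

# Values taken from the benchmark docstring; without them NCCL falls back to
# the non-NVLS all-reduce path.
env = dict(os.environ,
           NCCL_NVLS_ENABLE="1",
           NCCL_CUMEM_ENABLE="1",
           VLLM_USE_NCCL_SYMM_MEM="1")

subprocess.run(
    ["torchrun", "--nproc_per_node=2", "benchmark_device_communicators.py"],
    env=env,
    check=True,
)
```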

View File

@ -9,6 +9,9 @@ import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
@ -31,6 +34,8 @@ def run_benchmark(
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
implementation: str,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
@ -38,6 +43,14 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
if implementation not in ("cuda", "triton"):
raise ValueError(
f"Unsupported implementation: {implementation}. "
"Only 'cuda' and 'triton' are supported."
)
if implementation == "triton" and kv_cache_layout == "HND":
return float("nan") # Triton does not support HND layout yet.
current_platform.seed_everything(42)
torch.set_default_device(device)
@ -65,27 +78,49 @@ def run_benchmark(
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
if implementation == "cuda":
function_under_test = lambda: ops.reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
else:
function_under_test = lambda: triton_reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
ops.reshape_and_cache_flash(
key,
value,
key_cache,
value_cache,
slot_mapping,
kv_cache_dtype,
k_scale,
v_scale,
)
torch.cuda.synchronize()
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@ -116,10 +151,16 @@ def main(args):
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
implementation=args.implementation,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
print(
f"Benchmark results for implementation {args.implementation}"
f" (measuring with {args.mode}):"
)
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
@ -151,6 +192,21 @@ if __name__ == "__main__":
)
parser.add_argument("--iters", type=int, default=100)
parser.add_argument(
"--implementation",
type=str,
choices=["cuda", "triton"],
default="cuda",
)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)
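
The new `cudagraph` mode wraps the kernel call in a captured graph and replays it, which removes per-iteration launch overhead from the measurement. A generic sketch of that capture-and-replay timing pattern (the `fn` being timed is a placeholder):

```python
import torch

def time_with_cuda_graph(fn, n_iters: int = 100) -> float:
    """Return mean milliseconds per replay of `fn` captured in a CUDA graph."""
    # Warm up outside the graph so lazy initialization is not captured.
    for _ in range(3):
        fn()
    torch.cuda.synchronize()

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        fn()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(n_iters):
        g.replay()
    end.record()
    end.synchronize()
    return start.elapsed_time(end) / n_iters
```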

View File

@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any
import torch
import triton
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)

View File

@ -10,7 +10,7 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
w8a8_block_fp8_matmul,
w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
@ -59,7 +59,7 @@ def benchmark_shape(m: int,
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_block_fp8_matmul(A_vllm,
return w8a8_triton_block_scaled_mm(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,

View File

@ -258,7 +258,8 @@ set(VLLM_EXT_SRC
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
"csrc/cpu/torch_bindings.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC

View File

@ -135,10 +135,10 @@ public:
max_splits = min(16, max_splits);
// TODO: This avoids a hang when the batch size is larger than 1 and
// there is more than 4 kv_splits.
// there is more than 1 kv_split.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(2, max_splits);
max_splits = min(1, max_splits);
}
// printf(" max_splits = %d\n", max_splits);

View File

@ -88,8 +88,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
ops.def(
"dynamic_4bit_int_moe("
"Tensor x, Tensor topk_ids, Tensor topk_weights,"
"Tensor w13_packed, Tensor w2_packed, int H, int I, int I2,"
"int group_size, bool apply_router_weight_on_input, int activation_kind"
") -> Tensor");
ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
// PagedAttention V2.
ops.def(
"paged_attention_v2("

View File

@ -0,0 +1,38 @@
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
// maximum blocks per SM cap
#ifndef VLLM_LAUNCH_BLOCKS_CAP
#define VLLM_LAUNCH_BLOCKS_CAP 4
#endif
// compile-time estimate of max threads per SM for launch bounds.
#ifndef VLLM_MAX_THREADS_PER_SM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#define VLLM_MAX_THREADS_PER_SM 1536
#else
#define VLLM_MAX_THREADS_PER_SM 2048
#endif
#endif
// compute the number of blocks per SM to request in __launch_bounds__
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
(((VAL) <= 0) \
? 1 \
: (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
// runtime helper to compute blocks/SM
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
int device = -1;
cudaGetDevice(&device);
int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
cudaDeviceGetAttribute(&max_threads_per_sm,
cudaDevAttrMaxThreadsPerMultiProcessor, device);
int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
}
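
The header above replaces the hard-coded `2048 / block.x` heuristic with a device query clamped to `VLLM_LAUNCH_BLOCKS_CAP`. The same arithmetic can be sanity-checked from Python via torch's device properties; a sketch, not part of the change:

```python
import torch

def blocks_per_sm(block_threads: int, launch_blocks_cap: int = 4) -> int:
    # Mirrors vllm_runtime_blocks_per_sm: query the device instead of assuming
    # 2048 threads per SM, then clamp to [1, launch_blocks_cap].
    props = torch.cuda.get_device_properties(torch.cuda.current_device())
    max_threads = props.max_threads_per_multi_processor
    blocks = max_threads // block_threads if block_threads > 0 else 1
    return max(1, min(blocks, launch_blocks_cap))

print(blocks_per_sm(512))   # e.g. 4 on devices with 2048 threads per SM
print(blocks_per_sm(1024))  # e.g. 2
```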

View File

@ -0,0 +1,156 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <torch/all.h>
// _dyn_quant_matmul_4bit is only available on AArch64.
#if defined(__aarch64__)
#include <ATen/ops/_dyn_quant_matmul_4bit.h>
#endif
inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w,
int64_t group_size_eff, int64_t in_features,
int64_t out_features) {
#if defined(__aarch64__)
return at::_ops::_dyn_quant_matmul_4bit::call(a, packed_w, group_size_eff,
in_features, out_features);
#else
TORCH_CHECK(false,
"dynamic 4-bit int MoE path requires AArch64 (ARM64); "
"_dyn_quant_matmul_4bit is unavailable on this architecture");
return {};
#endif
}
enum ActivationKind : int64_t {
SwiGLU_Gu = 0, // act = SiLU(g) * u
SwiGLUOAI = 1, // act = SiLU(u) * g
SiLU = 2 // SiLU
};
torch::Tensor dynamic_4bit_int_moe_cpu(
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
int64_t activation_kind) {
TORCH_CHECK(x.dim() == 2, "x must be 2D");
TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2,
"topk tensors must be [T, K]");
TORCH_CHECK(
w13_packed.size(0) == w2_packed.size(0),
"w13_packed and w2_packed must have same number of experts in dim 0");
TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I");
const int64_t T = x.size(0);
const int64_t K = topk_ids.size(1);
const int64_t E = w13_packed.size(0);
const int64_t N = T * K;
auto x_c = x.contiguous();
auto ids_c = topk_ids.contiguous();
auto gates_c = topk_weights.to(at::kFloat).contiguous();
// bucketing tokens -> experts
c10::SmallVector<int64_t, 64> counts(
E, 0); // Small vector uses stack allocation
{
const auto* ids_ptr = ids_c.data_ptr<int64_t>();
for (int64_t i = 0; i < N; ++i) {
const int64_t e_id = ids_ptr[i];
TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range");
counts[e_id]++;
}
}
c10::SmallVector<int64_t, 65> offsets(E + 1, 0); // ( E +1 )
for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];
auto expert_tokens = at::empty({offsets[E]}, ids_c.options());
auto expert_gates = at::empty({offsets[E]}, gates_c.options());
{
c10::SmallVector<int64_t, 64> cursor(E, 0);
const auto* ids_ptr = ids_c.data_ptr<int64_t>();
const auto* gts_ptr = gates_c.data_ptr<float>();
auto* tok_ptr = expert_tokens.data_ptr<int64_t>();
auto* gate_ptr = expert_gates.data_ptr<float>();
for (int64_t t = 0; t < T; ++t) {
const int64_t base = t * K;
for (int64_t k = 0; k < K; ++k) {
const int64_t idx = base + k;
const int64_t e = ids_ptr[idx];
const int64_t p = offsets[e] + (cursor[e]++);
tok_ptr[p] = t;
gate_ptr[p] = gts_ptr[idx];
}
}
}
const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
const int64_t g_eff_2 = (group_size != -1) ? group_size : I;
// Per-expert outputs filled in parallel
std::vector<torch::Tensor> y_list(E);
y_list.resize(E);
at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
for (int64_t e = e_begin; e < e_end; ++e) {
const int64_t te = counts[e];
if (te == 0) {
y_list[e] = at::empty({0, H}, x_c.options());
continue;
}
const int64_t start = offsets[e];
auto sel_tokens =
expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto gates_e =
expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);
if (apply_router_weight_on_input) {
x_e = x_e.mul(gates_e.unsqueeze(1));
}
auto w13_e = w13_packed.select(/*dim=*/0, e);
auto w2_e = w2_packed.select(/*dim=*/0, e);
// W13
auto y13 =
mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2);
auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I);
auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I);
torch::Tensor act;
if (activation_kind == ActivationKind::SwiGLUOAI) { // SwiGLUOAI
constexpr double kAlpha = 1.702; // GPT-OSS default
constexpr double kLimit = 7.0; // GPT-OSS default
auto gate_c = at::clamp_max(g_part, kLimit);
auto up_c = at::clamp(u_part, -kLimit, kLimit);
auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha)));
act = up_c.add(1.0).mul(glu);
} else {  // SiLU or SwiGLU_Gu; vLLM maps silu to SiluAndMul()
act = at::silu(g_part).mul(u_part);
}
// W2
auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);
if (!apply_router_weight_on_input) {
y = y.mul(gates_e.unsqueeze(1));
}
// Store per-expert result
y_list[e] = y;
}
});
// Concatenate all expert outputs to match expert_tokens order
auto Y_all = at::cat(y_list, /*dim=*/0);
auto out = at::zeros({T, H}, x.options());
out =
at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);
return out;
}
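
The bucketing section of this kernel is effectively a counting sort of the flattened (token, k) pairs by expert id, so each expert's tokens can be gathered into one contiguous matmul. A small Python sketch of the same counts/offsets/cursor scheme, with made-up shapes and ids:

```python
import torch

# Illustrative inputs: T=4 tokens, K=2 experts chosen per token, E=3 experts.
topk_ids = torch.tensor([[0, 2], [1, 2], [0, 1], [2, 2]])
T, K = topk_ids.shape
E = 3

flat = topk_ids.reshape(-1)                      # N = T*K routed pairs
counts = torch.bincount(flat, minlength=E)       # tokens routed to each expert
offsets = torch.zeros(E + 1, dtype=torch.long)
offsets[1:] = torch.cumsum(counts, dim=0)        # prefix sums -> bucket starts

expert_tokens = torch.empty(T * K, dtype=torch.long)
cursor = offsets[:-1].clone()
for t in range(T):
    for k in range(K):
        e = int(topk_ids[t, k])
        expert_tokens[cursor[e]] = t             # token index, grouped by expert
        cursor[e] += 1

# expert_tokens now lists token ids bucketed expert-by-expert, matching the
# order used to gather x_e = x.index_select(0, sel_tokens) per expert.
print(offsets.tolist(), expert_tokens.tolist())
```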

View File

@ -418,6 +418,15 @@ __device__ inline T neg_inf() {
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
}
template <typename T>
__device__ inline bool is_finite(const T val) {
#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
return cuda::std::isfinite(val);
#else
return isfinite(cuda_cast<float, T>(val));
#endif
}
template <typename T>
__device__ void topk_with_k2(T* output, T const* input,
cg::thread_block_tile<32> const& tile,
@ -533,7 +542,7 @@ __global__ void group_idx_and_topk_idx_kernel(
// calculate group_idx
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
// The check is necessary to avoid abnormal input
if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) {
if (lane_id < n_group && is_finite(group_scores[lane_id])) {
value = group_scores[lane_id];
}
@ -568,11 +577,10 @@ __global__ void group_idx_and_topk_idx_kernel(
int32_t offset = i_group * num_experts_per_group;
for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) {
T candidates =
(i < num_experts_per_group) &&
cuda::std::isfinite(scores_with_bias[offset + i])
? scores_with_bias[offset + i]
: neg_inf<T>();
T candidates = (i < num_experts_per_group) &&
is_finite(scores_with_bias[offset + i])
? scores_with_bias[offset + i]
: neg_inf<T>();
queue.add(candidates, offset + i);
}
if (group_scores[i_group] == topk_group_value) {

View File

@ -328,6 +328,12 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
const std::optional<torch::Tensor>& has_initial_state,
const torch::Tensor& ssm_states, int64_t pad_slot_id);
torch::Tensor dynamic_4bit_int_moe_cpu(
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
int64_t activation_kind);
using fptr_t = int64_t;
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank,

View File

@ -23,9 +23,14 @@
typedef __hip_bfloat162 __nv_bfloat162;
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat16_raw __nv_bfloat16_raw;
#if defined(HIP_FP8_TYPE_OCP)
typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
#else
// ROCm 6.2 fallback: only *_fnuz types exist
typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
#endif
#endif
#include "core/registration.h"

View File

@ -26,6 +26,7 @@
#include "dispatch_utils.h"
#include "cuda_utils.h"
#include "launch_bounds_utils.h"
#include "nvfp4_utils.cuh"
namespace vllm {
@ -63,7 +64,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(1024, 4)
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out,
uint32_t* SFout) {
@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
int const numBlocksPerSM = 2048 / block.x;
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
VLLM_DISPATCH_HALF_TYPES(

View File

@ -26,12 +26,13 @@
#include "dispatch_utils.h"
#include "nvfp4_utils.cuh"
#include "launch_bounds_utils.h"
namespace vllm {
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
__global__ void __launch_bounds__(512, 4)
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out, uint32_t* SFout,
uint32_t* input_offset_by_experts,
@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4)
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
__global__ void __launch_bounds__(1024, 4)
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out, uint32_t* SFout,
uint32_t* input_offset_by_experts,
@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input,
int const workSizePerRow = k / ELTS_PER_THREAD;
int const totalWorkSize = m_topk * workSizePerRow;
dim3 block(std::min(workSizePerRow, 512));
// Get number of blocks per SM (assume we can fully utilize the SM).
int const numBlocksPerSM = 2048 / block.x;
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
multiProcessorCount * numBlocksPerSM));
while (grid.x <= multiProcessorCount && block.x > 64) {

View File

@ -26,13 +26,14 @@
#include "dispatch_utils.h"
#include "cuda_utils.h"
#include "launch_bounds_utils.h"
#include "nvfp4_utils.cuh"
namespace vllm {
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, 4)
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out, uint32_t* SFout) {
using PackedVec = PackedVec<Type>;
@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
// Grid, Block size.
// Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
// Get number of blocks per SM (assume we can fully utilize the SM).
int const numBlocksPerSM = 2048 / block.x;
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
// Launch the cvt kernel.

View File

@ -12,8 +12,8 @@
#include "../vectorization_utils.cuh"
#include "../../dispatch_utils.h"
__device__ __forceinline__ float GroupReduceMax(float val, const int tid) {
unsigned mask = 0xffff;
__device__ __forceinline__ float GroupReduceMax(float val) {
unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
@ -86,7 +86,7 @@ __global__ void per_token_group_quant_8bit_kernel(
threads_per_group, // stride in group
scalar_op_cache); // scalar handler
local_absmax = GroupReduceMax(local_absmax, lane_id);
local_absmax = GroupReduceMax(local_absmax);
float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {

View File

@ -25,6 +25,12 @@
#include "../attention/dtype_fp8.cuh"
#include "../quantization/fp8/amd/quant_utils.cuh"
// ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
#if !defined(HIP_FP8_TYPE_OCP)
using __hip_fp8_e4m3 = __hip_fp8_e4m3_fnuz;
using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
#endif
#if defined(__HIPCC__) && \
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
#define __HIP__GFX9__

View File

@ -5,11 +5,14 @@
torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
const int64_t rows_per_block);
torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const c10::optional<at::Tensor>& in_bias,
const int64_t CuCount);
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount);
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
const at::Tensor& scale_a, const at::Tensor& scale_b,
const int64_t CuCount);
void paged_attention(
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,

View File

@ -292,8 +292,9 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
wvSplitK_hf_sml_(const int K, const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2;
#if defined(__HIP__MI3XX__)
@ -484,7 +485,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
// if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]);
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][i] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
}
C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
}
}
@ -529,7 +537,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
// if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
if (BIAS)
sum4[n][i][0] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
}
@ -541,8 +551,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
__global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
@ -553,8 +565,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
wvSplitK_hf_(const int K, const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2;
#if defined(__HIP__MI3XX__)
@ -772,8 +785,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (commitColumn[i])
if (commitColumn[i]) {
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][i] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
}
C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
}
}
}
}
@ -818,8 +840,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
// if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
if (commitColumn[i]) {
if (BIAS)
sum4[n][i][0] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
}
}
}
@ -842,8 +868,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
__global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
@ -854,8 +882,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
wvSplitK_hf_big_(const int K, const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2;
#if defined(__HIP__MI3XX__)
@ -1124,8 +1153,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (commitColumn[i])
if (commitColumn[i]) {
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][i] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
}
C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
}
}
}
}
@ -1166,8 +1204,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
// if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
if (commitColumn[i]) {
if (BIAS)
sum4[n][i][0] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
}
}
}
@ -1190,8 +1232,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
__global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
@ -1226,11 +1270,20 @@ int mindiv(int N, int div1, int div2) {
return rtn;
}
torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const c10::optional<at::Tensor>& in_bias,
const int64_t CuCount) {
auto M_in = in_a.size(0);
auto K_in = in_a.size(1);
auto N_in = in_b.size(0);
auto Bx_in =
(in_bias.has_value() && in_bias->numel() > 0)
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
: 1;
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
in_bias->sizes().size() == 2)
? in_bias->size(0)
: 1;
TORCH_CHECK(in_a.dtype() == in_b.dtype());
TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
@ -1254,18 +1307,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else if (K_in * N_in <= max_lds_len * 1.2) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} \
}
@ -1273,6 +1326,10 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
using fptype = typename scalar<scalar_t>::type;
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
const fptype* biasf4 =
(in_bias.has_value() && in_bias->numel() > 0)
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
switch (N_in) {
case 1:
@ -1300,8 +1357,9 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B,
const fp8_t* __restrict__ A, scalar_t* C,
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx,
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp,
const int CuCount) {
@ -1453,7 +1511,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 0) {
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
if (y + m >= M) break; // To avoid mem access fault.
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][y][0] +=
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
}
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]); // * sA * sB);
}
}
}
@ -1465,7 +1533,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
const fp8_t* B, const fp8_t* __restrict__ A,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS,
scalar_t* C, const float* __restrict__ s_A,
const float* __restrict__ s_B,
const int _WvPrGrp, const int CuCount) {
@ -1477,8 +1547,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B,
const fp8_t* __restrict__ A, scalar_t* C,
wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx,
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A, const float* __restrict__ s_B,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE;
@ -1626,7 +1697,16 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][y][0] +=
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
}
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
}
}
}
@ -1638,16 +1718,19 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
const fp8_t* B, const fp8_t* __restrict__ A,
scalar_t* C, const float* __restrict__ s_A,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp,
const int CuCount) {
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
at::Tensor& scale_a, at::Tensor& scale_b,
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
const at::Tensor& scale_a, const at::Tensor& scale_b,
const int64_t CuCount) {
static c10::ScalarType kFp8Type = is_fp8_ocp()
? c10::ScalarType::Float8_e4m3fn
@ -1656,6 +1739,15 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
auto K_in = in_a.size(1);
auto N_in = in_b.size(0);
auto Kp_in = in_a.stride(0);
auto Bx_in =
(in_bias.has_value() && in_bias->numel() > 0)
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
: 1;
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
in_bias->sizes().size() == 2)
? in_bias->size(0)
: 1;
TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0");
TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type);
TORCH_CHECK(out_c.dtype() == torch::kFloat16 ||
@ -1673,13 +1765,15 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
__wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
__wvPrGrp, CuCount); \
} \
}
@ -1691,6 +1785,9 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] {
auto a_ptr = in_a.data_ptr<fp8_t>();
auto b_ptr = in_b.data_ptr<fp8_t>();
auto bias_ptr = (in_bias.has_value() && in_bias->numel() > 0)
? reinterpret_cast<fptype*>(in_bias->data_ptr())
: nullptr;
switch (N_in) {
case 1:
WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1)

View File

@ -22,13 +22,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
// Custom gemm op for skinny matrix-matrix multiplication
rocm_ops.def(
"wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> "
"wvSplitK(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> "
"Tensor");
rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);
// wvSplitK for fp8
rocm_ops.def(
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, "
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, "
"Tensor scale_a, "
" Tensor scale_b, int CuCount) -> ()");
rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ);

View File

@ -65,8 +65,6 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
pip install -r requirements.txt && git submodule update --init --recursive \
@ -77,14 +75,20 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install
FROM base AS build_fa
ARG FA_BRANCH
ARG FA_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
@ -103,6 +107,8 @@ FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
@ -111,13 +117,7 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
pip install /install/*.whl
ARG BASE_IMAGE

View File

@ -680,7 +680,7 @@ vllm bench serve \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completion
--endpoint /v1/chat/completions
```
##### Videos (ShareGPT4Video)
@ -707,7 +707,7 @@ vllm bench serve \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completion
--endpoint /v1/chat/completions
```
##### Synthetic Random Images (random-mm)

View File

@ -36,22 +36,23 @@ th:not(:first-child) {
}
</style>
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | |
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
| best-of | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | |
| beam-search | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ |
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
@ -76,3 +77,4 @@ th:not(:first-child) {
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ | ❌ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [](gh-issue:25097) |

View File

@ -23,7 +23,7 @@ Now supports 5 types of connectors:
- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling, which uses NIXL as the underlying KV transmission mechanism.
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv.
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
- **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs, such as:
@ -31,6 +31,18 @@ Now supports 5 types of connectors:
--kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
```
For NixlConnector, you may also specify one or more NIXL backends, for example:
```bash
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_buffer_device":"cuda", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'
```
- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker):
```bash
--kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "num_cpu_blocks": 1000}}'
```
## Benchmarks
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.

View File

@ -0,0 +1,159 @@
# NixlConnector Usage Guide
NixlConnector is a high-performance KV cache transfer connector for vLLM's disaggregated prefilling feature. It provides fully asynchronous send/receive operations using the NIXL library for efficient cross-process KV cache transfer.
## Prerequisites
### Installation
As a quick start, install the NIXL library with `uv pip install nixl`.
- Refer to the [NIXL official repository](https://github.com/ai-dynamo/nixl) for more detailed installation instructions
- The required NIXL version is pinned in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
### Transport Configuration
NixlConnector uses the NIXL library for its underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary, default transport used by NIXL. Configure the transport environment variables:
```bash
# Example UCX configuration; adjust according to your environment
export UCX_TLS=all # or list specific transports, e.g. "rc,ud,sm,^cuda_ipc"
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
```
!!! tip
When using UCX as the transport backend, NCCL environment variables (like `NCCL_IB_HCA`, `NCCL_SOCKET_IFNAME`) are not applicable to NixlConnector, so configure UCX-specific environment variables instead of NCCL variables.
## Basic Usage (on the same host)
### Producer (Prefiller) Configuration
Start a prefiller instance that produces KV caches:
```bash
# 1st GPU as prefiller
CUDA_VISIBLE_DEVICES=0 \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
vllm serve Qwen/Qwen3-0.6B \
--port 8100 \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
```
### Consumer (Decoder) Configuration
Start a decoder instance that consumes KV caches:
```bash
# 2nd GPU as decoder
CUDA_VISIBLE_DEVICES=1 \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5601 \
vllm serve Qwen/Qwen3-0.6B \
--port 8200 \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
```
### Proxy Server
Use a proxy server to route requests between prefiller and decoder:
```bash
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
--port 8192 \
--prefiller-hosts localhost \
--prefiller-ports 8100 \
--decoder-hosts localhost \
--decoder-ports 8200
```
## Environment Variables
- `VLLM_NIXL_SIDE_CHANNEL_PORT`: Port for NIXL handshake communication
- Default: 5600
- **Required for both prefiller and decoder instances**
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
- For TP/DP deployments, each worker's port on a node is computed as base_port + dp_rank * tp_size + tp_rank (e.g., with `--tensor-parallel-size=4` and base_port=5600, tp_rank 0..3 use ports 5600, 5601, 5602, 5603 on that node); see the short sketch after this list.
- Used for the initial NIXL handshake between the prefiller and the decoder
- `VLLM_NIXL_SIDE_CHANNEL_HOST`: Host for side channel communication
- Default: "localhost"
- Set when prefiller and decoder are on different machines
- Connection info is passed via KVTransferParams from prefiller to decoder for handshake
- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller's KV cache for a particular request (optional)
- Default: 120
- If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
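The port layout described above can be reproduced with a few lines of plain Python. This is a minimal sketch; `base_port`, `dp_rank`, `tp_size`, and `tp_rank` are illustrative variables, not vLLM APIs — the engine derives these values internally:
```python
# Minimal sketch of the side-channel port layout described above.
# All names are illustrative; vLLM computes these values internally.
base_port = 5600   # VLLM_NIXL_SIDE_CHANNEL_PORT
dp_rank = 0        # data-parallel rank of this engine on the node
tp_size = 4        # --tensor-parallel-size

for tp_rank in range(tp_size):
    port = base_port + dp_rank * tp_size + tp_rank
    print(f"dp_rank={dp_rank} tp_rank={tp_rank} -> side-channel port {port}")
# -> ports 5600, 5601, 5602, 5603
```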
## Multi-Instance Setup
### Multiple Prefiller Instances on Different Machines
```bash
# Prefiller 1 on Machine A (example IP: ${IP1})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP1} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
# Prefiller 2 on Machine B (example IP: ${IP2})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP2} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
```
### Multiple Decoder Instances on Different Machines
```bash
# Decoder 1 on Machine C (example IP: ${IP3})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP3} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
# Decoder 2 on Machine D (example IP: ${IP4})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP4} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
```
### Proxy for Multiple Instances
```bash
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
--port 8192 \
--prefiller-hosts ${IP1} ${IP2} \
--prefiller-ports 8000 8000 \
--decoder-hosts ${IP3} ${IP4} \
--decoder-ports 8000 8000
```
### KV Role Options
- **kv_producer**: For prefiller instances that generate KV caches
- **kv_consumer**: For decoder instances that consume KV caches from the prefiller
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
!!! tip
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
## Example Scripts/Code
Refer to these example scripts in the vLLM repository:
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)

View File

@ -6,9 +6,6 @@ This page teaches you how to pass prompt embedding inputs to vLLM.
The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
!!! note
Prompt embeddings are currently only supported in the v0 engine.
## Offline Inference
To input prompt embeddings, follow this schema in [vllm.inputs.EmbedsPrompt][]:
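For example, here is a minimal sketch of passing externally computed embeddings, assuming `enable_prompt_embeds=True` is available in your vLLM build; the model name is purely illustrative:
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

model_name = "meta-llama/Llama-3.1-8B-Instruct"  # illustrative choice

# Compute prompt embeddings outside vLLM using the model's own embedding matrix.
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
token_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
with torch.no_grad():
    prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

# Pass the embeddings directly as an EmbedsPrompt.
llm = LLM(model=model_name, enable_prompt_embeds=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
print(outputs[0].outputs[0].text)
```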

View File

@ -319,6 +319,15 @@ Supported models:
Flags: `--tool-call-parser glm45`
### Qwen3-Coder Models (`qwen3_xml`)
Supported models:
* `Qwen/Qwen3-Coder-480B-A35B-Instruct`
* `Qwen/Qwen3-Coder-30B-A3B-Instruct`
Flags: `--tool-call-parser qwen3_xml`
### Models with Pythonic Tool Calls (`pythonic`)
A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.

View File

@ -59,7 +59,7 @@ enabling the corresponding APIs:
#### Predefined models
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
you can override some of its attributes via the `--override-pooler-config` option.
you can override some of its attributes via the `--pooler-config` option.
#### Converted models
@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--override-pooler-config` option,
You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference

View File

@ -17,9 +17,24 @@ These models are what we list in [supported-text-models][supported-text-models]
### Transformers
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases.
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
To check if the modeling backend is Transformers, you can simply do this:
Currently, the Transformers backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only
- Attention types: full attention and/or sliding attention
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes (a short sketch follows this list):
- Pipeline parallel
- Tensor parallel
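For illustration, here is a hedged sketch of combining both schemes while forcing the Transformers backend; the model name and parallel sizes are placeholders, not recommendations:
```python
from vllm import LLM

# Placeholder model and sizes; forces the Transformers model implementation
# and combines tensor parallelism with pipeline parallelism.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    model_impl="transformers",
    tensor_parallel_size=2,
    pipeline_parallel_size=2,
)
```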
Checking if the modeling backend is Transformers is as simple as:
```python
from vllm import LLM
@ -27,16 +42,12 @@ llm = LLM(model=...) # Name or path of your model
llm.apply_model(lambda model: print(type(model)))
```
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
!!! tip
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
!!! note
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
!!! note
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with the config's `dtype` if it exists. In contrast, native Transformers respects the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
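If that difference matters for your comparison, one option is to pin the dtype explicitly instead of relying on `dtype="auto"`. A minimal sketch, with the model name as a placeholder:
```python
from vllm import LLM

# Placeholder model name; pinning the dtype avoids relying on dtype="auto".
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    model_impl="transformers",
    dtype="bfloat16",
)
```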
#### Custom models
@ -66,10 +77,11 @@ This section details the necessary modifications to make to a Transformers compa
To make your model compatible with the Transformers backend, it needs:
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
3. `MyModel` must contain `_supports_attention_backend = True`.
<details>
<details class="code">
<summary>modeling_my_model.py</summary>
```python
@ -78,6 +90,7 @@ from transformers import PreTrainedModel
from torch import nn
class MyAttention(nn.Module):
is_causal = False # Only do this for encoder-only models
def forward(self, hidden_states, **kwargs):
...
@ -101,13 +114,13 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it!
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
<details>
<details class="code">
<summary>configuration_my_model.py</summary>
```python
@ -339,6 +352,7 @@ th {
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -457,7 +471,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
!!! note
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
!!! note
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
@ -552,7 +566,7 @@ If your model is not in the above list, we will try to automatically convert the
!!! important
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
#### Token Classification

View File

@ -193,7 +193,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`
2. **Configure Both Instances**: Add this flag to both prefill and decode instances: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`. Note that you may also specify one or more NIXL backends, for example: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'`
3. **Client Orchestration**: Use the client-side script below to coordinate prefill/decode operations. We are actively working on routing solutions.

View File

@ -101,6 +101,13 @@ def parse_args():
"--quantization",
type=str,
)
parser.add_argument(
"--disable-expert-parallel",
dest="enable_expert_parallel",
action="store_false",
help="Disable expert parallel (default: enabled).",
)
parser.set_defaults(enable_expert_parallel=True)
return parser.parse_args()
@ -113,6 +120,7 @@ def main(
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
enable_expert_parallel,
trust_remote_code,
max_num_seqs,
max_model_len,
@ -168,7 +176,7 @@ def main(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=True,
enable_expert_parallel=enable_expert_parallel,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
@ -229,6 +237,7 @@ if __name__ == "__main__":
dp_master_port,
tp_size,
args.enforce_eager,
args.enable_expert_parallel,
args.trust_remote_code,
args.max_num_seqs,
args.max_model_len,

View File

@ -1,510 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from collections.abc import Generator
from dataclasses import asdict, dataclass
from typing import Any, Optional, TypeAlias
import torch
import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256
@dataclass
class ProfileContext:
engine_args: EngineArgs
prompt_len: int
batch_size: int
# The profiler can run in 2 modes,
# 1. Run profiler for user specified num_steps
num_steps: Optional[int] = None
# 2. Run profiler until all requests complete
complete_num_requests_per_step: Optional[int] = None
save_chrome_traces_folder: Optional[str] = None
def get_dtype(dtype: str):
if dtype == "torch.float":
return torch.float
else:
return dtype
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
def compute_request_output_lengths(
batch_size: int, step_requests: list[int]
) -> OutputLen_NumReqs_Map:
"""
Given the number of requests, batch_size, and the number of requests
that each engine-step should process, step_requests, determine the
output lengths of the requests such that step_request is honoured.
Example:
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
then return,
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
32 requests should have output length 2,
32 requests should have output length 3,
32 requests should have output length 4,
31 requests should have output length 5,
1 request should have output length 6.
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (list[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
number of requests required to have that output-length as values.
"""
ol_nr: OutputLen_NumReqs_Map = {}
# Number of request that are assigned an output-length
num_reqs_assigned: int = 0
num_steps: int = len(step_requests)
# sanity check. The first step (prefill-step), must process all requests.
assert step_requests[0] == batch_size
# Begin assignments from the last step.
output_length: int = num_steps
for num_requests_at_step in reversed(step_requests):
if num_reqs_assigned == batch_size:
break
assert num_reqs_assigned < batch_size
# Remove the number of requests that have been determined
# to participate in this step and beyond.
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
assert num_reqs_unassigned_at_step >= 0
if num_reqs_unassigned_at_step > 0:
ol_nr[output_length] = num_reqs_unassigned_at_step
num_reqs_assigned += num_reqs_unassigned_at_step
output_length -= 1
# sanity checks.
assert sum(ol_nr.values()) == batch_size, (
"Number of requests in output-length assignment does not match "
f"batch-size.\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
# Check that the output-length is in [1, num-steps]. Output length must be
# at least 1 as all requests must participate in the prefill-step.
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
"Output lengths of requests should be in range "
f"[1, num-engine-steps].\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
return ol_nr
def determine_requests_per_step(context: ProfileContext) -> list[int]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
same number of requests and the output list is of length
context.num_steps.
If context.complete_num_requests_per_step is set, then each decode step
processes fewer and fewer requests until there are no requests to process.
In this case, the output list is as big as the number of steps
required to process all requests.
Args:
context: ProfileContext object.
Returns:
list[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
if context.num_steps:
# All requests must run until num_engine_steps. This implies
# that their output lengths must be equal to num_engine_steps.
return [context.batch_size] * context.num_steps
assert (
context.complete_num_requests_per_step
and context.complete_num_requests_per_step > 0
), (
f"Expected a positive complete_num_requests_per_step argument."
f"Instead got {context.complete_num_requests_per_step}"
)
# We start dropping after the first decode step.
step_requests = [
context.batch_size, # prefill
context.batch_size, # decode
]
num_running_requests = context.batch_size
num_running_requests -= context.complete_num_requests_per_step
while num_running_requests > 0:
step_requests.append(num_running_requests)
num_running_requests -= context.complete_num_requests_per_step
if step_requests[-1] != 1:
# have 1 request running at the last step. This is often
# useful
step_requests.append(1)
return step_requests
def run_profile(
context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
):
print("Run profile with:")
for key, value in asdict(context).items():
print(f" {key} = {value}")
requests_per_step: list[int] = determine_requests_per_step(context)
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
context.batch_size, requests_per_step
)
num_steps_to_profile: int = len(requests_per_step)
max_output_len: int = max(ol_nr.keys())
assert max_output_len >= 1
# Create sampling params
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
# max_tokens is set on a per-request basis.
max_tokens=None,
ignore_eos=True,
)
# Create LLM
llm = LLM(**asdict(context.engine_args))
batch_size = context.batch_size
prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
if batch_size * prompt_len > max_num_batched_tokens:
print(
f"ERROR: chosen batch_size * prompt_len "
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
f"and therefore cannot be run in a single profile step, please "
f"choose a smaller batch size or prompt length, or increase "
f"--max-num-batched-tokens"
)
sys.exit(-1)
if batch_size > max_num_seqs:
print(
f"ERROR: chosen batch_size ({batch_size}) is larger than "
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
f"single profile step, please choose a smaller batch size"
)
sys.exit(-1)
print(
"llm.llm_engine.model_config.max_model_len: ",
llm.llm_engine.model_config.max_model_len,
)
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
print(
f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
f"than the model's max_model_len ({max_model_len}), please "
f"choose a smaller prompt_len or max_output_len, or increase "
f"--max-model-len"
)
sys.exit(-1)
def add_requests():
def get_output_len_generator() -> Generator[int, Any, Any]:
for output_len, num_reqs in ol_nr.items():
for _ in range(num_reqs):
yield output_len
output_len_generator = get_output_len_generator()
for i in range(batch_size):
sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint(
llm.get_tokenizer().vocab_size, size=(prompt_len,)
).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
prompt={"prompt_token_ids": prompt_token_ids},
params=sampling_params,
)
def abort_requests():
for i in range(batch_size):
llm.llm_engine.abort_request(f"seq{i}")
# Warm up run
print("Warm up run ...")
add_requests()
llm.llm_engine.step() # Prefill
llm.llm_engine.step() # Decode
abort_requests()
print("Profile run ...")
add_requests()
with layerwise_profile() as prefill_prof:
llm.llm_engine.step() # First step is prefill
decode_profs = []
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
llm.llm_engine.step()
decode_profs.append(decode_prof)
decode_results_list = [prof.results for prof in decode_profs]
prefill_results = prefill_prof.results
has_decode = len(decode_results_list) > 0
LINE_WIDTH = 80
print("=" * LINE_WIDTH)
print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_model_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_model_table()
print()
print("=" * LINE_WIDTH)
print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_summary_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_summary_table()
if csv_output:
csv_filename_base = (
csv_output[:-4] if csv_output.endswith(".csv") else csv_output
)
prefill_results.export_model_stats_table_csv(
csv_filename_base + "_prefill_model_table.csv"
)
prefill_results.export_summary_stats_table_csv(
csv_filename_base + "_prefill_summary_table.csv"
)
if has_decode:
decode_results_list[0].export_model_stats_table_csv(
csv_filename_base + "_decode_model_table.csv"
)
decode_results_list[0].export_summary_stats_table_csv(
csv_filename_base + "_decode_summary_table.csv"
)
if json_output:
cuda_devices = [
torch.cuda.get_device_properties(dev_idx)
for dev_idx in range(torch.cuda.device_count())
]
json_dict = {
"context": {
"python_version": f"{sys.version}",
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
**asdict(context),
},
"prefill": prefill_results.convert_stats_to_dict(),
}
if has_decode:
for idx, dr in enumerate(decode_results_list):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
# Add .json to json_output filename if it doesn't exist already.
json_output_file = (
json_output if json_output.endswith(".json") else json_output + ".json"
)
with open(json_output_file, "w+") as f:
json.dump(json_dict, f, indent=2)
pass
if context.save_chrome_traces_folder is not None:
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
prefill_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + "/prefill.json"
)
for idx, decode_prof in enumerate(decode_profs):
decode_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
)
print(
"Traces saved as prefill.json and decode_1.json, etc."
f" in folder {context.save_chrome_traces_folder}"
)
def parse_args():
parser = FlexibleArgumentParser(
description="""
Profile a model
example:
```
python examples/offline_inference/profiling.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
```
then you can use various tools to analyze the json output
terminal ascii tables:
```
python tools/profiler/print_layerwise_table.py \\
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
```
or create matplotlib stacked bar charts:
```
python tools/profiler/visualize_layerwise_profile.py \\
--json-trace Llama31-8b-FP8.json \\
--output-directory profile_breakdown --plot-metric pct_cuda_time
```
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--csv",
type=str,
default=None,
help="Export the results as multiple csv file. This should be the root "
"filename, will create <filename>_prefill_model_table.csv, "
"<filename>_prefill_summary_table.csv, "
"<filename>_decode_model_table.csv, and "
"<filename>_decode_summary_table.csv",
)
parser.add_argument(
"--json",
type=str,
default=None,
help="Export the results as a json file. This should be the filename",
)
parser.add_argument(
"--save-chrome-traces-folder",
type=str,
help="Save chrome traces for the prefill and decode "
"will save traces as prefill.json and decode_1.json, "
"etc. inside this folder",
)
parser.add_argument(
"--prompt-len",
type=int,
default=PROMPT_LEN_DEFAULT,
help=f"Length of the random prompt to use when profiling, all batched "
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
)
parser.add_argument(
"--batch-size",
type=int,
default=BATCH_SIZE_DEFAULT,
help=f"Number of requests to run as a single batch, "
f"default={BATCH_SIZE_DEFAULT}",
)
subparsers = parser.add_subparsers(dest="cmd")
run_num_steps_parser = subparsers.add_parser(
"run_num_steps", help="This variation profiles n engine.step() invocations."
)
run_num_steps_parser.add_argument(
"-n",
"--num-steps",
type=int,
help="Number of engine steps to profile.\n"
"Setting it to 1, profiles only the prefill step.\n"
"Setting it to 2, profiles the prefill and first decode step\n"
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
"and so on ...",
)
run_to_completion_parser = subparsers.add_parser(
"run_to_completion",
help="This variation profiles all the engine.step() invocations"
"until the engine exhausts all submitted requests.",
)
run_to_completion_parser.add_argument(
"-n",
"--complete-num-requests-per-step",
type=int,
help="Complete complete_num_requests_per_step requests every decode step."
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
"the profiler is run for 6 engine steps, with the steps processing, "
"128, 128, 96, 64, 32, 1 requests respectively.\n"
"Note that we tack-on a one-request step at the end as it is often "
"useful.",
)
EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args):
context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args),
**{
k: v
for k, v in vars(args).items()
if k in inspect.signature(ProfileContext).parameters
},
)
run_profile(context, csv_output=args.csv, json_output=args.json)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -5,7 +5,6 @@ from urllib.request import urlopen
from vllm import LLM, SamplingParams
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

View File

@ -49,6 +49,7 @@ def get_custom_mm_prompts(num_prompts):
def parse_args():
parser = FlexibleArgumentParser()
add_dataset_parser(parser)
parser.add_argument("--test", action="store_true")
parser.add_argument(
"--method",
type=str,
@ -60,6 +61,7 @@ def parse_args():
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--enforce-eager", action="store_true")
parser.add_argument("--enable-chunked-prefill", action="store_true")
parser.add_argument("--max-model-len", type=int, default=16384)
parser.add_argument("--temp", type=float, default=0)
parser.add_argument("--top-p", type=float, default=1.0)
parser.add_argument("--top-k", type=int, default=-1)
@ -71,8 +73,7 @@ def parse_args():
return parser.parse_args()
def main():
args = parse_args()
def main(args):
args.endpoint_type = "openai-chat"
model_dir = args.model_dir
@ -134,7 +135,7 @@ def main():
gpu_memory_utilization=0.8,
speculative_config=speculative_config,
disable_log_stats=False,
max_model_len=16384,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": 5},
disable_chunked_mm_input=True,
)
@ -198,6 +199,39 @@ def main():
acceptance_rate = acceptance_counts[i] / num_drafts if num_drafts > 0 else 0
print(f"acceptance at token {i}: {acceptance_rate:.2f}")
return acceptance_length
if __name__ == "__main__":
main()
args = parse_args()
acceptance_length = main(args)
if args.test:
# takes ~30s to run on 1xH100
assert args.method in ["eagle", "eagle3"]
assert args.tp == 1
assert args.num_spec_tokens == 3
assert args.dataset_name == "hf"
assert args.dataset_path == "philschmid/mt-bench"
assert args.num_prompts == 80
assert args.temp == 0
assert args.top_p == 1.0
assert args.top_k == -1
assert args.enable_chunked_prefill
# check acceptance length is within 2% of expected value
rtol = 0.02
expected_acceptance_length = 2.296 if args.method == "eagle" else 2.811
assert (
acceptance_length <= (1 + rtol) * expected_acceptance_length
and acceptance_length >= (1 - rtol) * expected_acceptance_length
), (
f"acceptance_length {acceptance_length} is not "
f"within {rtol * 100}% of {expected_acceptance_length}"
)
print(
f"Test passed! Expected AL: "
f"{expected_acceptance_length}, got {acceptance_length}"
)

View File

@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Experimental support for data-parallel inference with torchrun.
Note that data load balancing and distribution are handled outside the vLLM
engine; no internal load balancing is supported in external_launcher mode.
"""
from vllm import LLM, SamplingParams
# Create prompts, the same across all ranks
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 50
# Create sampling parameters, the same across all ranks
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Use `distributed_executor_backend="external_launcher"` so that
# this llm engine/instance only creates one worker.
# it is important to set an explicit seed to make sure that
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
tensor_parallel_size=1,
data_parallel_size=2,
pipeline_parallel_size=1,
enable_expert_parallel=False,
distributed_executor_backend="external_launcher",
max_model_len=4096,
gpu_memory_utilization=0.6,
seed=1,
)
dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size
prompts = [
f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank
]
outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips:
1. to communicate control messages across all ranks, use the cpu group,
a PyTorch ProcessGroup with GLOO backend.
```python
from vllm.distributed.parallel_state import get_world_group
cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
if torch_rank == 0:
# do something for rank 0, e.g. saving the results to disk.
```
2. to communicate data across all ranks, use the model's device group,
a PyTorch ProcessGroup with NCCL backend.
```python
from vllm.distributed.parallel_state import get_world_group
device_group = get_world_group().device_group
```
3. to access the model directly in every rank, use the following code:
```python
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
```
"""

View File

@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
# Dots-OCR
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
engine_args = EngineArgs(
model="rednote-hilab/dots.ocr",
limit_mm_per_prompt={modality: 1},
trust_remote_code=True,
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1676,6 +1693,7 @@ model_example_map = {
"aya_vision": run_aya_vision,
"blip-2": run_blip2,
"chameleon": run_chameleon,
"dots_ocr": run_dots_ocr,
"command_a_vision": run_command_a_vision,
"deepseek_vl_v2": run_deepseek_vl2,
"ernie45_vl": run_ernie45_vl,

View File

@ -42,7 +42,7 @@ python client.py
### Server Configuration
The key parameters for chunked processing are in the `--override-pooler-config`:
The key parameters for chunked processing are in the `--pooler-config`:
```json
{

View File

@ -13,7 +13,7 @@ Prerequisites:
# MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \
--override-pooler-config \
--pooler-config \
'{"pooling_type": "MEAN", "normalize": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
--served-model-name multilingual-e5-large \
@ -23,7 +23,7 @@ Prerequisites:
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \
--override-pooler-config \
--pooler-config \
'{"pooling_type": "CLS", "normalize": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
--served-model-name bge-large-en-v1.5 \

View File

@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab
vllm serve "$MODEL_NAME" \
--tensor-parallel-size "$GPU_COUNT" \
--enforce-eager \
--override-pooler-config "$POOLER_CONFIG" \
--pooler-config "$POOLER_CONFIG" \
--served-model-name ${MODEL_CODE} \
--api-key "$API_KEY" \
--trust-remote-code \

View File

@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import dataclasses
import json
import logging
import os
@ -327,12 +325,7 @@ def main():
if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
engine_args = EngineArgs.from_cli_args(args)
input_dir = tensorizer_dir.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex

View File

@ -102,6 +102,7 @@ plugins:
- https://numpy.org/doc/stable/objects.inv
- https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv
- https://huggingface.co/docs/transformers/main/en/objects.inv
markdown_extensions:
- attr_list

View File

@ -70,7 +70,6 @@ line-length = 80
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
@ -111,28 +110,6 @@ ignore_missing_imports = true
check_untyped_defs = true
follow_imports = "silent"
# After fixing type errors resulting from follow_imports: "skip" -> "silent",
# move the directory here and remove it from tools/mypy.sh
files = [
"vllm/*.py",
"vllm/assets",
"vllm/entrypoints",
"vllm/core",
"vllm/inputs",
"vllm/logging_utils",
"vllm/multimodal",
"vllm/platforms",
"vllm/transformers_utils",
"vllm/triton_utils",
"vllm/usage",
]
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = [
"vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
# Ignore triton kernels in ops.
'vllm/attention/ops/.*\.py$'
]
[tool.isort]
skip_glob = [
".buildkite/*",

View File

@ -24,7 +24,7 @@ outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs

View File

@ -1,5 +1,5 @@
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
absl-py==2.1.0
# via rouge-score
accelerate==1.0.1

View File

@ -14,14 +14,4 @@ nixl==0.3.0
tpu_info==0.4.0
# Install torch_xla
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu
--find-links https://storage.googleapis.com/libtpu-wheels/index.html
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.9.0.dev20250730
torchvision==0.24.0.dev20250730
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
torch_xla[tpu, pallas]==2.8.0

View File

@ -11,7 +11,7 @@ from unittest.mock import Mock
import pytest
import torch
from vllm import LLM, envs
from vllm import LLM
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from ..conftest import HfRunner, VllmRunner
@ -26,14 +26,6 @@ MODELS = [
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("distilbert/distilgpt2")
@ -76,17 +68,6 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
if enable_prompt_embeds and envs.is_set(
"VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.")
if not envs.VLLM_USE_V1:
if async_scheduling:
pytest.skip("async_scheduling only supported in v1.")
if model_executor != "uni":
pytest.skip("only test uniproc executor for v0.")
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
pytest.skip(
f"{backend} does not support gemma2 with full context length.")
@ -164,11 +145,6 @@ def test_models_distributed(
extra_env: dict[str, str],
enable_prompt_embeds: bool,
) -> None:
if enable_prompt_embeds and envs.is_set(
"VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.")
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

View File

@ -122,11 +122,12 @@ def test_cumem_with_cudagraph():
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
("facebook/opt-125m", True),
])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
assert use_v1
m.setenv("VLLM_USE_V1", "1")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)

View File

@ -1,39 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import Cython.Compiler.Options
from Cython.Build import cythonize
from setuptools import setup
Cython.Compiler.Options.annotate = True
infiles = []
infiles += [
"vllm/engine/llm_engine.py",
"vllm/transformers_utils/detokenizer.py",
"vllm/engine/output_processor/single_step.py",
"vllm/outputs.py",
"vllm/engine/output_processor/stop_checker.py",
]
infiles += [
"vllm/core/scheduler.py",
"vllm/sequence.py",
"vllm/core/block_manager.py",
]
infiles += [
"vllm/model_executor/layers/sampler.py",
"vllm/sampling_params.py",
"vllm/utils/__init__.py",
]
setup(ext_modules=cythonize(infiles,
annotate=False,
force=True,
compiler_directives={
'language_level': "3",
'infer_types': True
}))
# example usage: python3 build_cython.py build_ext --inplace

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
from collections.abc import Sequence
from copy import deepcopy
from typing import Callable, Union
@ -10,7 +11,26 @@ from torch._ops import OpOverload
from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
from vllm.compilation.pass_manager import with_pattern_match_debug
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig, get_current_vllm_config
class LazyInitPass(InductorPass):
"""
If there's a pass that we want to initialize lazily in a test,
we can wrap it in LazyInitPass, which will initialize the pass when invoked
and then immediately invoke it.
"""
def __init__(self, pass_cls: type[VllmInductorPass],
vllm_config: VllmConfig):
self.pass_cls = pass_cls
self.vllm_config = weakref.proxy(vllm_config) # avoid cycle
def __call__(self, graph: fx.Graph) -> None:
self.pass_ = self.pass_cls(self.vllm_config)
self.pass_(graph)
class TestBackend:
@ -40,10 +60,16 @@ class TestBackend:
example_inputs,
config_patches=self.inductor_config)
@with_pattern_match_debug
def post_pass(self, graph: fx.Graph):
self.graph_pre_pass = deepcopy(graph)
VllmInductorPass.dump_prefix = 0
for pass_ in self.custom_passes:
pass_(graph)
VllmInductorPass.dump_prefix += 1
VllmInductorPass.dump_prefix = None
self.graph_post_pass = deepcopy(graph)
# assign by reference, will reflect the final state of the graph

View File

@ -46,7 +46,10 @@ backend_configs = {
# FA3 on Hopper
"FA3":
BackendConfig(name="FA3",
env_vars={"VLLM_FLASH_ATTN_VERSION": "3"},
env_vars={
"VLLM_FLASH_ATTN_VERSION": "3",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
},
comp_config={
"cudagraph_mode": "FULL",
},
@ -66,6 +69,7 @@ backend_configs = {
BackendConfig(name="FlashAttentionMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
},
comp_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
@ -89,7 +93,10 @@ backend_configs = {
# FA2
"FA2":
BackendConfig(name="FA2",
env_vars={"VLLM_FLASH_ATTN_VERSION": "2"},
env_vars={
"VLLM_FLASH_ATTN_VERSION": "2",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
},
comp_config={
"cudagraph_mode": "FULL",
}),

View File

@ -15,6 +15,7 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
VllmConfig, set_current_vllm_config)
from vllm.envs import VLLM_USE_V1
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
# This import automatically registers `torch.ops.silly.attention`
from ..silly_attention import get_global_counter, reset_global_counter
@ -50,16 +51,21 @@ class SillyModel(nn.Module):
return x
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
assert VLLM_USE_V1
def _run_simple_model(
splitting_ops,
use_inductor_graph_partition,
use_inductor,
expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations,
expected_num_cudagraph_captured,
):
vllm_config = VllmConfig(compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
use_inductor=use_inductor,
splitting_ops=["silly.attention"],
splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True,
cudagraph_capture_sizes=[1, 2],
))
@ -70,11 +76,11 @@ def test_simple_piecewise_compile(use_inductor):
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
num_cudagraph_captured=
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
num_piecewise_capturable_graphs_seen=
expected_num_piecewise_capturable_graphs_seen,
num_backend_compilations=expected_num_backend_compilations,
num_cudagraph_captured=expected_num_cudagraph_captured,
), set_forward_context(None,
vllm_config=vllm_config): # background context
# warm up with background context
@ -104,3 +110,46 @@ def test_simple_piecewise_compile(use_inductor):
output = model(input)
assert get_global_counter() == 2
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
assert VLLM_USE_V1
_run_simple_model(
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1
expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
expected_num_backend_compilations=
3, # num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
)
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
def test_simple_inductor_graph_partition(splitting_ops):
assert VLLM_USE_V1
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available "
"in PyTorch 2.9+")
_run_simple_model(
# inductor graph partition automatically resets splitting_ops
# to be an empty list
splitting_ops=splitting_ops,
use_inductor_graph_partition=True,
use_inductor=True,
expected_num_piecewise_graphs_seen=
1, # since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=
1, # since not splitting at fx graph level
expected_num_backend_compilations=
1, # since not splitting at fx graph level
expected_num_cudagraph_captured=
6, # inductor graph partition still captures 6
# graphs, same as fx graph partition.
)
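For reference, a small standalone sketch (not part of the diff) of where the expected counter values in the two tests above come from, assuming the toy model with two attention layers and cudagraph_capture_sizes=[1, 2] used by _run_simple_model:
# Standalone sketch; the numbers mirror the comments in the tests above.
num_layers = 2
num_cudagraph_sizes = 2  # len(cudagraph_capture_sizes) == len([1, 2])
# FX-level splitting on "silly.attention":
num_piecewise_graphs = 2 * num_layers + 1                       # 5
num_capturable = num_layers + 1                                 # 3 (attention pieces excluded)
num_backend_compilations = num_capturable                       # 3
num_cudagraph_captured = num_cudagraph_sizes * num_capturable   # 6
assert (num_piecewise_graphs, num_capturable,
        num_backend_compilations, num_cudagraph_captured) == (5, 3, 3, 6)
# With use_inductor_graph_partition=True the fx graph is not split (1 graph
# seen and compiled), but 6 cudagraphs are still captured because inductor
# partitions internally around the cudagraph-unsafe ops.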

View File

@ -60,4 +60,5 @@ direct_register_custom_op(
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
tags=(torch._C.Tag.cudagraph_unsafe, ),
)
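For context, a hedged sketch (not part of the diff) of registering a cudagraph-unsafe custom op end to end. The op_name/op_func keyword names are assumptions based on how direct_register_custom_op is used elsewhere in vLLM; the tags argument mirrors the line added above and needs a PyTorch build that exposes torch._C.Tag.cudagraph_unsafe (2.9+).
import torch
from torch.library import Library
from vllm.utils import direct_register_custom_op

demo_lib = Library("demo_ops", "FRAGMENT")  # hypothetical namespace for illustration

def demo_copy(x: torch.Tensor, out: torch.Tensor) -> None:
    # real implementation: writes into the mutable "out" buffer
    out.copy_(x)

def demo_copy_fake(x: torch.Tensor, out: torch.Tensor) -> None:
    # fake/meta implementation used during tracing; no side effects needed
    return

direct_register_custom_op(
    op_name="demo_copy",
    op_func=demo_copy,
    mutates_args=["out"],
    fake_impl=demo_copy_fake,
    target_lib=demo_lib,
    # Marks the op as unsafe to capture inside an inductor-partitioned
    # CUDA graph, so the partitioner splits around it.
    tags=(torch._C.Tag.cudagraph_unsafe, ),
)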

View File

@ -294,6 +294,8 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)
assert async_tp_pass.matched_count == 1
# In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)

View File

@ -20,7 +20,6 @@ class TestSetting:
tp_size: int
attn_backend: str
method: str
fullgraph: bool
# we cannot afford testing the full Cartesian product
@ -36,7 +35,6 @@ class TestSetting:
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
@ -46,7 +44,6 @@ class TestSetting:
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
@ -56,7 +53,6 @@ class TestSetting:
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
@ -73,7 +69,6 @@ class TestSetting:
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
TestSetting(
model="BAAI/bge-base-en-v1.5",
@ -82,7 +77,6 @@ class TestSetting:
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
@ -92,7 +86,6 @@ class TestSetting:
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
],
)
@ -109,9 +102,8 @@ def test_compile_correctness(
tp_size = test_setting.tp_size
attn_backend = test_setting.attn_backend
method = test_setting.method
fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
if cuda_device_count_stateless() < pp_size * tp_size:
pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
f"{cuda_device_count_stateless()}")
with monkeypatch.context() as m:
@ -149,9 +141,5 @@ def test_compile_correctness(
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method)

View File

@ -4,7 +4,7 @@ import pytest
import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig
from vllm.config import CompilationConfig, VllmConfig
from vllm.utils import _is_torch_equal_or_newer
@ -26,6 +26,14 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert not vllm_config.compilation_config.use_cudagraph
def test_custom_op():
# proper syntax
_ = CompilationConfig(custom_ops=["+quant_fp8", "-silu_and_mul"])
with pytest.raises(ValueError, match="Invalid syntax '"):
_ = CompilationConfig(custom_ops=["quant_fp8"])
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends

View File

@ -3,6 +3,7 @@
from __future__ import annotations
import logging
import tempfile
from typing import Any, Optional, Union
@ -10,9 +11,13 @@ import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from tests.v1.attention.utils import _Backend
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationLevel, PassConfig
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
PassConfig)
from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer
from ..utils import create_new_process_for_each_test
@ -79,9 +84,7 @@ def test_full_graph(
):
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
with monkeypatch.context():
print(f"MODEL={model}")
run_model(optimization_level, model, model_kwargs)
@ -107,6 +110,18 @@ def test_full_graph(
(CompilationConfig(level=CompilationLevel.PIECEWISE,
debug_dump_path=tempfile.gettempdir()),
("facebook/opt-125m", {})),
] + [
# graph inductor partition
(
CompilationConfig(
level=CompilationLevel.PIECEWISE,
# inductor graph partition uses
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
use_inductor_graph_partition=True,
cudagraph_mode=CUDAGraphMode.PIECEWISE,
compile_sizes=[1, 2]),
model) for model in models_list(all=False)
if is_torch_equal_or_newer("2.9.0.dev")
])
# only test some of the models
@create_new_process_for_each_test()
@ -114,11 +129,51 @@ def test_custom_compile_config(
compilation_config: CompilationConfig,
model_info: tuple[str, dict[str, Any]],
):
if (compilation_config.use_inductor_graph_partition
and not is_torch_equal_or_newer("2.9.0.dev")):
pytest.skip("inductor graph partition is only available "
"in PyTorch 2.9+")
model, model_kwargs = model_info
print(f"MODEL={model}")
run_model(compilation_config, model, model_kwargs)
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available "
"in PyTorch 2.9+")
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_inductor_graph_partition=True,
cudagraph_mode=CUDAGraphMode.PIECEWISE,
custom_ops=["+quant_fp8"],
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
)
model_kwargs = {
"kv_cache_dtype": "fp8",
"max_model_len": 1024,
}
with caplog_vllm.at_level(
logging.DEBUG), global_force_attn_backend_context_manager(
_Backend.FLASHINFER):
run_model(compilation_config, model, model_kwargs)
try:
assert ("Fused quantization onto 48 attention nodes"
in caplog_vllm.text), caplog_vllm.text
except AssertionError:
# Note: this message is only emitted when the compilation actually goes
# through the custom pass. Due to multiple layers of caching on the
# PyTorch side, the compilation of a graph may be served from cache, in
# which case the custom pass is skipped entirely. When that happens we
# take this branch and assert that the pass was not triggered.
assert "Fused quantization" not in caplog_vllm.text
def run_model(compile_config: Union[int, CompilationConfig], model: str,
model_kwargs: dict[str, Any]):
prompts = [

View File

@ -8,9 +8,10 @@ import vllm.envs as envs
from vllm import LLM, SamplingParams
from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import FUSED_OPS, FusionPass
from vllm.compilation.fusion import FUSED_OPS, RMSNormQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import CompilationConfig, PassConfig, VllmConfig
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)
@ -58,11 +59,12 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
vllm_config.compilation_config = CompilationConfig(
pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True))
noop_pass = NoOpEliminationPass(vllm_config)
fusion_pass = FusionPass.instance(vllm_config)
fusion_pass = RMSNormQuantFusionPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config)
passes = [noop_pass, fusion_pass, act_quant_fusion_pass
] if do_fusion else [noop_pass]
passes = [noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass
] if do_fusion else [noop_pass, cleanup_pass]
func_pass = FixFunctionalizationPass(vllm_config)
backend_func = TestBackend(*passes, func_pass)
backend_no_func = TestBackend(*passes)

View File

@ -4,11 +4,11 @@
import pytest
import torch
import vllm.envs as envs
import vllm.plugins
from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
FusionPass)
RMSNormQuantFusionPass)
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
VllmConfig)
from vllm.model_executor.layers.layernorm import RMSNorm
@ -79,15 +79,15 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("hidden_size", [64])
@pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False])
# cuda_force_torch is used to test the torch code path on platforms where
# cutlass_fp8_supported() == True.
@pytest.mark.parametrize("cuda_force_torch",
[True, False] if cutlass_fp8_supported() else [True])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only test on CUDA and ROCm")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
cuda_force_torch):
@ -104,9 +104,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
with vllm.config.set_current_vllm_config(vllm_config):
# Reshape pass is needed for the fusion pass to work
noop_pass = NoOpEliminationPass(vllm_config)
fusion_pass = FusionPass.instance(vllm_config)
fusion_pass = RMSNormQuantFusionPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
backend = TestBackend(noop_pass, fusion_pass)
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
model = TestModel(hidden_size, eps, static, cuda_force_torch)
# First dimension dynamic
@ -128,6 +129,8 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
assert fusion_pass.matched_count == 2
# In pre-nodes, fp8 quant should be there and fused kernels should not
backend.check_before_ops(model.ops_in_model_before())

View File

@ -9,6 +9,7 @@ import vllm.envs as envs
from vllm.compilation.collective_fusion import AllReduceFusionPass
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig,
ModelConfig, PassConfig, VllmConfig)
from vllm.distributed import tensor_model_parallel_all_reduce
@ -215,8 +216,10 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
noop_pass = NoOpEliminationPass(vllm_config)
func_pass = FixFunctionalizationPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass)
backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass,
cleanup_pass)
token_num = batch_size * seq_len
model = test_model_cls(hidden_size, token_num)
@ -227,6 +230,7 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states, residual)
assert all_reduce_fusion_pass.matched_count == 1
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
backend.check_after_ops(model.ops_in_model_after())
del all_reduce_fusion_pass

View File

@ -6,18 +6,19 @@ from typing import Optional
import pytest
import torch._dynamo
from tests.compile.backend import TestBackend
from tests.compile.backend import LazyInitPass, TestBackend
from tests.models.utils import check_outputs_equal
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata)
from vllm import LLM, SamplingParams
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention import Attention
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
ModelConfig, PassConfig, SchedulerConfig, VllmConfig,
set_current_vllm_config)
@ -27,6 +28,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp)
from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer
from vllm.v1.kv_cache_interface import AttentionSpec
FP8_DTYPE = current_platform.fp8_dtype()
@ -53,8 +55,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
# Use global backends
global backend, backend_unfused
use_v1 = False # can be made a param once V1 support added
monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
# Prompt 4 seems too open-ended, differs between fused and unfused
@ -104,7 +105,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
# AttnFusionPass needs attention layers to be registered in config upon init
# so we initialize it during compilation.
attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw)
attn_pass = LazyInitPass(AttnFusionPass, vllm_config)
backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass)
llm2 = LLM(model,
enforce_eager=True,
@ -197,7 +198,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
device=self.device,
)
def build_attn_metadata(self, batch_size: int, use_hnd: bool):
def build_attn_metadata(self, batch_size: int, use_hnd: bool) \
-> AttentionMetadata:
"""Initialize attention metadata."""
# Create common attn metadata
@ -334,11 +336,16 @@ else:
[7, 256, 533] if current_platform.is_cuda() else [8])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("model_name, model_class", MODELS)
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
current_platform.is_cuda() else [_Backend.ROCM_FLASH])
@pytest.mark.parametrize("backend",
[_Backend.FLASHINFER] if current_platform.is_cuda()
else [_Backend.TRITON_ATTN_VLLM_V1])
@pytest.mark.parametrize(
"split_attention",
[False, True] if current_platform.is_rocm() else [False])
# TODO(boyuan): test inductor graph partition on rocm
@pytest.mark.parametrize(
"use_inductor_graph_partition",
[False] if current_platform.is_rocm() else [False, True])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only test ROCm or CUDA")
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
@ -352,9 +359,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
dtype: torch.dtype, model_name: str,
model_class: type[AttentionQuantPatternModel],
backend: _Backend, split_attention: bool,
monkeypatch, dist_init):
use_inductor_graph_partition: bool,
monkeypatch, dist_init, caplog_vllm):
"""Test AttentionStaticQuantPattern fusion pass"""
if use_inductor_graph_partition and not is_torch_equal_or_newer(
"2.9.0.dev"):
pytest.skip("inductor graph partition is only available "
"in PyTorch 2.9+")
monkeypatch.setenv("VLLM_USE_V1", "1")
if split_attention:
monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
@ -372,6 +385,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
custom_ops=["+quant_fp8"],
use_inductor_graph_partition=use_inductor_graph_partition,
),
cache_config=CacheConfig(cache_dtype="fp8"))
@ -435,15 +449,17 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
# Create test backend with fusion passes enabled
noop_pass = NoOpEliminationPass(vllm_config)
attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw
)
test_backend = TestBackend(noop_pass, attn_pass)
attn_pass = LazyInitPass(AttnFusionPass, vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
test_backend = TestBackend(noop_pass, attn_pass, cleanup_pass)
# Compile model with fusion enabled
model_compiled = torch.compile(model_fused,
backend=test_backend,
fullgraph=True)
assert model_compiled.attn._o_scale_float is None
result_fused_1 = model_compiled(q, k, v)
if backend == _Backend.FLASHINFER:
@ -453,6 +469,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
# _o_scale_float
assert model_compiled.attn._o_scale_float is not None
result_fused_2 = model_compiled(q, k, v)
assert model_compiled.attn._o_scale_float is not None
torch.testing.assert_close(result_unfused,
@ -471,6 +488,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
test_backend.check_before_ops([QUANT_OPS[quant_key]],
fully_replaced=True)
# access the underlying `AttnFusionPass` on the `LazyInitPass`
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
# Check attention ops in the graph before and after fusion
attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))
attn_nodes_post = list(find_op_nodes(ATTN_OP,

View File

@ -6,10 +6,12 @@ import torch
import vllm.envs as envs
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import FusionPass
from vllm.compilation.fusion import RMSNormQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
PassConfig, VllmConfig)
from vllm.distributed import tensor_model_parallel_all_reduce
@ -104,7 +106,7 @@ class TestQuantModel(torch.nn.Module):
# Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02)
self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False)
self.fp8_linear = Fp8LinearOp(act_quant_static=True)
self.scale = torch.rand(1, dtype=torch.float32)
# Create a weight that is compatible with torch._scaled_mm,
@ -137,8 +139,7 @@ class TestQuantModel(torch.nn.Module):
# layer normalization
norm_output, residual_output = self.norm(all_reduce, residual)
# for static input quantization
# self.fp8_linear is initialized with use_per_token_if_dynamic=False
# scaled_mm with static input quantization
fp8_linear_result = self.fp8_linear.apply(norm_output,
self.w,
self.wscale,
@ -253,16 +254,20 @@ def sequence_parallelism_pass_on_test_model(
dtype=dtype,
seed=42)
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
noop_pass = NoOpEliminationPass(vllm_config)
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
func_pass = FixFunctionalizationPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
passes_for_backend = [noop_pass, sequence_parallelism_pass]
passes_for_backend: list[VllmInductorPass] = \
[noop_pass, sequence_parallelism_pass]
if enable_fusion:
fusion_pass = FusionPass.instance(vllm_config)
fusion_pass = RMSNormQuantFusionPass(vllm_config)
passes_for_backend.append(fusion_pass)
passes_for_backend.append(cleanup_pass)
backend_no_func = TestBackend(*passes_for_backend)
backend_func = TestBackend(*passes_for_backend, func_pass)
@ -279,6 +284,8 @@ def sequence_parallelism_pass_on_test_model(
compiled_model_func = torch.compile(model, backend=backend_func)
compiled_model_func(hidden_states, residual)
assert sequence_parallelism_pass.matched_count == 1
# In pre-nodes, all reduce should be there,
# reduce scatter and all gather should not
backend_no_func.check_before_ops(model.ops_in_model_before())

View File

@ -15,6 +15,7 @@ from vllm.compilation.activation_quant_fusion import (
# yapf: enable
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import CompilationConfig, PassConfig, VllmConfig
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.utils.quant_utils import (
@ -69,6 +70,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
super().__init__()
from vllm.compilation.activation_quant_fusion import (
silu_and_mul_nvfp4_quant_supported)
assert silu_and_mul_nvfp4_quant_supported
self.silu_and_mul = SiluAndMul()
# create nvfp4 weight
@ -127,7 +132,11 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
pass_config=PassConfig(enable_fusion=True, enable_noop=True))
fusion_pass = ActivationQuantFusionPass(config)
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
passes = [
NoOpEliminationPass(config), fusion_pass,
PostCleanupPass(config)
]
backend = TestBackend(*passes)
model = model_class(hidden_size=hidden_size,
cuda_force_torch=cuda_force_torch,
x=x)
@ -151,6 +160,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
atol=atol,
rtol=rtol)
assert fusion_pass.matched_count == 1
# In pre-nodes, quant op should be present and fused kernels should not
backend.check_before_ops(model.ops_in_model_before())

View File

@ -19,6 +19,7 @@ import socket
import tempfile
import threading
from collections.abc import Generator
from contextlib import nullcontext
from enum import Enum
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
@ -39,19 +40,20 @@ from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
from vllm.config.model import (ConvertOption, RunnerOption,
_get_and_verify_dtype)
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.sequence import Logprob
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils import set_default_torch_num_threads
logger = init_logger(__name__)
@ -158,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch):
monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v0 = request.node.get_closest_marker("skip_v0")
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
if skip_v0:
pytest.skip("Skipping test on vllm V0")
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
@pytest.fixture(autouse=True)
def init_test_http_connection():
# pytest_asyncio may use a different event loop per test
@ -244,39 +226,6 @@ class DecoderPromptType(Enum):
EMPTY_STR = 3
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
decoder prompt) tuple.
Returns:
* Encoder prompt list
* Decoder prompt list (reverse of encoder prompt list)
'''
encoder_prompts = []
for filename in _TEST_PROMPTS:
encoder_prompts += _read_prompts(filename)
custom_decoder_prompts = encoder_prompts[::-1]
empty_str_decoder_prompts = [""] * len(encoder_prompts)
none_decoder_prompts = [None] * len(encoder_prompts)
# NONE decoder prompt type
return {
DecoderPromptType.NONE:
zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
DecoderPromptType.EMPTY_STR:
zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
DecoderPromptType.CUSTOM:
zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
}
@pytest.fixture
def example_long_prompts() -> list[str]:
prompts = []
@ -338,6 +287,35 @@ class HfRunner:
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
# Set this to avoid a hanging issue
default_torch_num_threads: Optional[int] = None,
) -> None:
init_ctx = (nullcontext() if default_torch_num_threads is None else
set_default_torch_num_threads(default_torch_num_threads))
with init_ctx:
self._init(
model_name=model_name,
dtype=dtype,
model_kwargs=model_kwargs,
trust_remote_code=trust_remote_code,
is_sentence_transformer=is_sentence_transformer,
is_cross_encoder=is_cross_encoder,
skip_tokenizer_init=skip_tokenizer_init,
auto_cls=auto_cls,
)
def _init(
self,
model_name: str,
dtype: str = "auto",
*,
model_kwargs: Optional[dict[str, Any]] = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
) -> None:
model_name = maybe_model_redirect(model_name)
self.model_name = model_name
@ -690,68 +668,6 @@ class HfRunner:
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def generate_encoder_decoder_greedy_logprobs_limit(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None,
**kwargs: Any,
) -> list[TokensTextLogprobs]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs: list[list[dict[int, float]]] = []
all_output_ids: list[list[int]] = []
all_output_strs: list[str] = []
for i, (encoder_prompt, decoder_prompt) in enumerate(
to_enc_dec_tuple_list(encoder_decoder_prompts)):
processor_kwargs: dict[str, Any] = {
"text": encoder_prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
encoder_inputs = self.processor(**processor_kwargs)
encoder_inputs = self.wrap_device(encoder_inputs)
if decoder_prompt is None:
decoder_input_ids = None
else:
decoder_inputs = self.tokenizer(decoder_prompt,
return_tensors="pt")
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
output = self.model.generate(
decoder_input_ids=decoder_input_ids,
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**encoder_inputs,
**kwargs,
)
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def encode(self, prompts: list[str], *args,
**kwargs) -> list[list[torch.Tensor]]:
return self.model.encode(prompts, *args, **kwargs)
@ -808,26 +724,32 @@ class VllmRunner:
enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
# Set this to avoid a hanging issue
default_torch_num_threads: Optional[int] = None,
**kwargs,
) -> None:
self.llm = LLM(
model=model_name,
runner=runner,
convert=convert,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
**kwargs,
)
init_ctx = (nullcontext() if default_torch_num_threads is None else
set_default_torch_num_threads(default_torch_num_threads))
with init_ctx:
self.llm = LLM(
model=model_name,
runner=runner,
convert=convert,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
**kwargs,
)
def get_inputs(
self,
@ -940,26 +862,6 @@ class VllmRunner:
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)
def generate_encoder_decoder_w_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''
assert sampling_params.logprobs is not None
req_outputs = self.llm.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)
def generate_greedy(
self,
prompts: Union[list[str], list[torch.Tensor]],
@ -1037,29 +939,6 @@ class VllmRunner:
return perplexities
def generate_encoder_decoder_greedy_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
skip_special_tokens: bool = True,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs,
prompt_logprobs=(num_prompt_logprobs),
skip_special_tokens=skip_special_tokens,
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
return self.generate_encoder_decoder_w_logprobs(
encoder_decoder_prompts, greedy_logprobs_params)
def generate_beam_search(
self,
prompts: list[str],
@ -1124,17 +1003,7 @@ class VllmRunner:
return [req_output.outputs.score for req_output in req_outputs]
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
if hasattr(self.llm.llm_engine, "model_executor"):
# This works either in V0 or in V1 with
# VLLM_ENABLE_V1_MULTIPROCESSING=0
executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)
# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
def _apply_model(self):
return func(self.get_model())
return self.llm.llm_engine.collective_rpc(_apply_model)
return self.llm.apply_model(func)
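For reference, a hedged sketch (not part of the diff) of the LLM.apply_model call that the executor branching above was collapsed into; the model name is a placeholder, and the callable runs once per worker process, so a list of per-worker results comes back:
from vllm import LLM

llm = LLM(model="facebook/opt-125m", enforce_eager=True)  # placeholder model
# apply_model runs the callable on the underlying nn.Module in whichever
# process owns it and returns one result per worker.
param_counts = llm.apply_model(
    lambda model: sum(p.numel() for p in model.parameters()))
print(param_counts)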
def get_llm(self) -> LLM:
return self.llm
@ -1210,7 +1079,7 @@ def dummy_llava_path():
local_dir=_dummy_llava_path,
ignore_patterns=[
"*.bin", "*.bin.index.json", "*.pt", "*.h5",
"*.msgpack"
"*.msgpack", "*.safetensors"
])
assert os.path.exists(json_path)
with open(json_path) as f:
@ -1229,7 +1098,7 @@ def dummy_gemma2_embedding_path():
local_dir=_dummy_gemma2_embedding_path,
ignore_patterns=[
"*.bin", "*.bin.index.json", "*.pt", "*.h5",
"*.msgpack"
"*.msgpack", "*.safetensors"
])
assert os.path.exists(json_path)
with open(json_path) as f:

View File

@ -32,10 +32,6 @@ def _test_stopping(llm: LLM,
assert output.stop_reason == expected_reason
def _set_async_mode(llm, is_async):
llm.llm_engine.scheduler[0].use_async_output_proc = is_async
def _stop_basic(llm):
_test_stopping(llm,
stop=["."],
@ -103,40 +99,8 @@ def test_stop_strings():
# async output processing below.
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
if envs.VLLM_USE_V1:
_stop_basic(llm)
else:
_set_async_mode(llm, True)
_stop_basic(llm)
_set_async_mode(llm, False)
_stop_basic(llm)
if envs.VLLM_USE_V1:
_stop_multi_tokens(llm)
else:
_set_async_mode(llm, True)
_stop_multi_tokens(llm)
_set_async_mode(llm, False)
_stop_multi_tokens(llm)
if envs.VLLM_USE_V1:
_stop_partial_token(llm)
else:
_set_async_mode(llm, True)
_stop_partial_token(llm)
_set_async_mode(llm, False)
_stop_partial_token(llm)
if envs.VLLM_USE_V1:
# FIXME: this does not respect include_in_output=False
# _stop_token_id(llm)
pass
else:
_set_async_mode(llm, True)
_stop_token_id(llm)
_set_async_mode(llm, False)
_stop_token_id(llm)
_stop_basic(llm)
_stop_multi_tokens(llm)
_stop_partial_token(llm)
# FIXME: this does not respect include_in_output=False
# _stop_token_id(llm)

View File

@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import typing
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.device_communicators.cuda_communicator import (
CudaCommunicator)
from vllm.distributed.device_communicators.pynccl import (
register_nccl_symmetric_ops)
from vllm.distributed.device_communicators.pynccl_allocator import (
get_nccl_mem_pool, is_symmetric_memory_enabled)
from vllm.distributed.parallel_state import (get_tp_group,
init_distributed_environment,
initialize_model_parallel)
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
torch.manual_seed(42)
random.seed(44)
test_size_elements = 4 * 1024 * 1024
def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
monkeypatch = pytest.MonkeyPatch()
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
dtype = torch.bfloat16
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
update_environment_variables({
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
})
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
cuda_communicator = typing.cast(CudaCommunicator,
get_tp_group().device_communicator)
pynccl_comm = cuda_communicator.pynccl_comm
if get_nccl_mem_pool() is None:
pytest.skip("NCCL allocator compilation failed "
"(probably missing NCCL headers).")
if not is_symmetric_memory_enabled():
pytest.skip("NCCL symmetric memory allreduce is disabled.")
register_nccl_symmetric_ops(pynccl_comm)
input = torch.randint(1,
23, (test_size_elements, ),
dtype=dtype,
device=device)
input_clone = input.clone()
output = torch.ops.vllm.all_reduce_symmetric_with_copy(input)
assert output is not None
group = get_tp_group().device_group
dist.all_reduce(input_clone, group=group)
torch.testing.assert_close(output, input_clone, atol=2.5, rtol=0.1)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
reason="Only test on CUDA")
def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
# Enable SymmMemCommunicator
monkeypatch.setenv("VLLM_USE_NCCL_SYMM_MEM", "1")
monkeypatch.setenv("NCCL_NVLS_ENABLE", "1")
monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1")
mp.spawn(nccl_symm_mem_allreduce_worker,
args=(world_size, ),
nprocs=world_size)
cleanup_dist_env_and_memory()

View File

@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
@ -382,7 +382,6 @@ def test_tp_language_generation(
test_options: PPTestOptions,
num_gpus_available,
):
pytest.skip("Skipping the test until V1 passes it.")
_compare_tp(model_id,
parallel_setup,
distributed_backend,
@ -410,7 +409,6 @@ def test_tp_language_embedding(
test_options: PPTestOptions,
num_gpus_available,
):
pytest.skip("Skipping the test until V1 passes it.")
_compare_tp(model_id,
parallel_setup,
distributed_backend,
@ -438,7 +436,6 @@ def test_tp_multimodal_generation(
test_options: PPTestOptions,
num_gpus_available,
):
pytest.skip("Skipping the test until V1 passes it.")
_compare_tp(model_id,
parallel_setup,
distributed_backend,

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import queue
import random
import typing
@ -10,26 +11,31 @@ import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.device_communicators.cuda_communicator import (
CudaCommunicator)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
get_tp_group,
from vllm.distributed.parallel_state import (get_tp_group,
init_distributed_environment,
initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
torch.manual_seed(42)
random.seed(44)
test_size_elements = 4 * 1024 * 1024
test_size_elements = 1024 * 1024
def symm_mem_allreduce_worker(local_rank: int, world_size: int):
def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
monkeypatch = pytest.MonkeyPatch()
with monkeypatch.context() as m:
config = VllmConfig(parallel_config=ParallelConfig(
tensor_parallel_size=world_size))
with monkeypatch.context() as m, set_current_vllm_config(config):
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
dtype = torch.bfloat16
device = torch.device(f"cuda:{local_rank}")
@ -51,22 +57,26 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int):
get_tp_group().device_communicator)
symm_mem_comm = cuda_communicator.symm_mem_comm
if symm_mem_comm is None or symm_mem_comm.disabled:
pytest.skip("SymmMemCommunicator is not available or disabled.")
# can't use skip under multiprocessing
q.put("SymmMemCommunicator is not available or disabled.")
return
inp_direct_symm_mem = torch.randint(1,
23, (test_size_elements, ),
dtype=dtype,
device=device)
if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
pytest.skip(
# can't use skip under multiprocessing
q.put(
"SymmMemCommunicator isn't used for this world and input size."
)
return
original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
out_direct_symm_mem = symm_mem_comm.all_reduce(inp_direct_symm_mem)
assert out_direct_symm_mem is not None
group = get_tensor_model_parallel_group().device_group
group = get_tp_group().device_group
dist.all_reduce(original_inp_direct_symm_mem, group=group)
torch.testing.assert_close(out_direct_symm_mem,
original_inp_direct_symm_mem,
@ -100,9 +110,34 @@ def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
q = mp.get_context('spawn').Queue()
mp.spawn(symm_mem_allreduce_worker,
args=(world_size, q),
nprocs=world_size)
try:
val = q.get(timeout=1)
except queue.Empty:
val = None
finally:
cleanup_dist_env_and_memory()
if val is not None:
pytest.skip(val)
# Enable SymmMemCommunicator
monkeypatch.setenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1")
mp.spawn(symm_mem_allreduce_worker, args=(world_size, ), nprocs=world_size)
cleanup_dist_env_and_memory()
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="SymmMemAllreduce is only available for CUDA platforms.")
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
reason="Only test on CUDA")
def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
world_size = 4
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
# Verify that the DataParallel runs without error
engine_args = EngineArgs(model="distilbert/distilgpt2",
enforce_eager=True,
enable_prefix_caching=True,
data_parallel_size=2,
tensor_parallel_size=2,
data_parallel_backend="mp")
LLMEngine.from_engine_args(engine_args)

View File

@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# unit test for `examples/offline_inference/torchrun_example.py`
import os
import random
import torch.distributed as dist
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_tp_group, get_world_group
dist.init_process_group(backend="gloo")
# Create prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
dp_size = int(os.getenv("DP_SIZE", "1"))
dp_rank = int(os.getenv("DP_RANK", "0"))
if dp_size > 1:
# distribute the prompts across the data parallel ranks
prompts = [
prompt for idx, prompt in enumerate(prompts)
if idx % dp_size == dp_rank
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(model="microsoft/Phi-mini-MoE-instruct",
tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0)
outputs = llm.generate(prompts, sampling_params)
group = get_world_group() if dp_size == 1 else get_tp_group()
cpu_group = group.cpu_group
group_rank = dist.get_rank(group=cpu_group)
def test_consistent_across_ranks(obj):
if group_rank == 0:
dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group)
else:
container = [None]
dist.broadcast_object_list(container,
src=group.ranks[0],
group=cpu_group)
assert container[0] == obj
test_consistent_across_ranks(
llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
model.parameters())
test_consistent_across_ranks(len(params))
# all ranks should have the same outputs
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
test_consistent_across_ranks(prompt)
test_consistent_across_ranks(generated_text)
print(f"Rank {group_rank}, Prompt: {prompt!r}, "
f"Generated text: {generated_text!r}")

View File

@ -25,12 +25,6 @@ TOKEN_IDS = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to

View File

@ -6,14 +6,6 @@ import pytest
from vllm import LLM
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def test_empty_prompt():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
from typing import Union
import openai # use the official client for correctness check
@ -284,3 +285,62 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
output.extend(chunk.choices[0].delta.tool_calls)
for o in output:
assert o.id is None or o.id == 'functions.get_current_weather:0'
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("arguments", ["{}", ''])
async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str,
arguments: str):
# Step 1: Define a tool that requires no parameters
tools = [{
"type": "function",
"function": {
"name": "get_current_time",
"description":
"Get the current date and time. No parameters needed.",
"parameters": {
"type": "object",
"properties": {}, # No parameters
"required": [] # No required fields
}
}
}]
messages = [{"role": "user", "content": "What time is it now?"}]
# Step 2: Send user message and let model decide whether to call the tool
response = await client.chat.completions.create(
model=model_name,
messages=messages,
tools=tools,
tool_choice="auto" # Let model choose automatically
)
# Step 3: Check if model wants to call a tool
message = response.choices[0].message
if message.tool_calls:
# Get the first tool call
tool_call = message.tool_calls[0]
tool_name = tool_call.function.name
# Step 4: Execute the tool locally (no parameters)
if tool_name == "get_current_time":
# Test both empty string and "{}" for no-arg tool calls
tool_call.function.arguments = arguments
messages.append(message)
current_time = datetime.datetime.now()
result = current_time.isoformat()
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": result,
})
# Step 5: Send tool result back to model to continue conversation
final_response = await client.chat.completions.create(
model=model_name,
messages=messages,
)
# Output final natural language response
assert final_response.choices[0].message.content is not None
else:
# No tool called — just print model's direct reply
assert message.content is not None

View File

@ -14,9 +14,6 @@ from transformers import AutoConfig
from ...utils import RemoteOpenAIServer
pytest.skip("Skipping prompt_embeds test until V1 supports it.",
allow_module_level=True)
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@ -36,7 +33,6 @@ def default_server_args() -> list[str]:
"--enforce-eager",
# Prompt Embeds server args
"--enable-prompt-embeds",
"--no-enable-chunked-prefill",
]
@ -64,6 +60,7 @@ def create_dummy_embeds(num_tokens: int = 5) -> str:
return base64.b64encode(buffer.getvalue()).decode('utf-8')
@pytest.mark.skip("This test is skipped because it is flaky.")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_prompt_embeds(

View File

@ -5,7 +5,7 @@ from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Optional
from unittest.mock import MagicMock
from unittest.mock import AsyncMock, MagicMock
import pytest
@ -83,20 +83,31 @@ def register_mock_resolver():
def mock_serving_setup():
"""Provides a mocked engine and serving completion instance."""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
def mock_add_lora_side_effect(lora_request: LoRARequest):
tokenizer = get_tokenizer(MODEL_NAME)
mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer)
async def mock_add_lora_side_effect(lora_request: LoRARequest):
"""Simulate engine behavior when adding LoRAs."""
if lora_request.lora_name == "test-lora":
# Simulate successful addition
return
elif lora_request.lora_name == "invalid-lora":
return True
if lora_request.lora_name == "invalid-lora":
# Simulate failure during addition (e.g. invalid format)
raise ValueError(f"Simulated failure adding LoRA: "
f"{lora_request.lora_name}")
return True
mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)
async def mock_generate(*args, **kwargs):
for _ in []:
yield _
mock_engine.generate = MagicMock(spec=AsyncLLM.generate,
side_effect=mock_generate)
mock_engine.add_lora.side_effect = mock_add_lora_side_effect
mock_engine.generate.reset_mock()
mock_engine.add_lora.reset_mock()
@ -131,7 +142,7 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup,
with suppress(Exception):
await serving_completion.create_completion(req_found)
mock_engine.add_lora.assert_called_once()
mock_engine.add_lora.assert_awaited_once()
called_lora_request = mock_engine.add_lora.call_args[0][0]
assert isinstance(called_lora_request, LoRARequest)
assert called_lora_request.lora_name == lora_model_name
@ -157,7 +168,7 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup,
response = await serving_completion.create_completion(req)
mock_engine.add_lora.assert_not_called()
mock_engine.add_lora.assert_not_awaited()
mock_engine.generate.assert_not_called()
assert isinstance(response, ErrorResponse)
@ -181,7 +192,7 @@ async def test_serving_completion_resolver_add_lora_fails(
response = await serving_completion.create_completion(req)
# Assert add_lora was called before the failure
mock_engine.add_lora.assert_called_once()
mock_engine.add_lora.assert_awaited_once()
called_lora_request = mock_engine.add_lora.call_args[0][0]
assert isinstance(called_lora_request, LoRARequest)
assert called_lora_request.lora_name == invalid_model

View File

@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
"--port",
port,
],
env={"VLLM_USE_V1": "1" if use_v1 else "0"})
env={"VLLM_USE_V1": "1"})
def is_server_up(url):
try:

View File

@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module")
def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch):
args = ["--enforce-eager", "--tool-server", "demo"]
with monkeypatch_module.context() as m:
m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="function")
def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch):
args = ["--enforce-eager", "--tool-server", "demo"]
with monkeypatch_module.context() as m:
m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
m.setenv("GPT_OSS_SYSTEM_TOOL_MCP_LABELS",
"code_interpreter,container")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def mcp_disabled_client(mcp_disabled_server):
async with mcp_disabled_server.get_async_client() as async_client:
yield async_client
@pytest_asyncio.fixture
async def mcp_enabled_client(mcp_enabled_server):
async with mcp_enabled_server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI,
model_name: str):
response = await mcp_enabled_client.responses.create(
model=model_name,
# TODO: Ideally we would set max tool calls to prevent
# multi-turn, which would speed up the test, but that is
# not currently supported.
input=("What's the first 4 digits after the decimal point of "
"cube root of `19910212 * 20250910`? "
"Show only the digits. The python interpreter is not stateful "
"and you must print to see the output."),
tools=[{
"type": "mcp",
"server_label": "code_interpreter",
# URL unused for DemoToolServer
"server_url": "http://localhost:8888"
}],
)
assert response is not None
assert response.status == "completed"
assert response.usage.output_tokens_details.tool_output_tokens > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI,
model_name: str):
response = await mcp_disabled_client.responses.create(
model=model_name,
# TODO: Ideally we would set max tool calls to prevent
# multi-turn, which would speed up the test, but that is
# not currently supported.
input=("What's the first 4 digits after the decimal point of "
"cube root of `19910212 * 20250910`? "
"Show only the digits. The python interpreter is not stateful "
"and you must print to see the output."),
tools=[{
"type": "mcp",
"server_label": "code_interpreter",
# URL unused for DemoToolServer
"server_url": "http://localhost:8888"
}],
)
assert response is not None
assert response.status == "completed"
assert response.usage.output_tokens_details.tool_output_tokens == 0

View File

@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
assert response3.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(client: OpenAI, model_name: str):
prompts = [
"tell me a story about a cat in 20 words",
]
# This maps each "done" event type to its corresponding "start" type,
# so every "done" should have a matching "start" and every open block
# should be closed by the end of the stream.
pairs_of_event_types = {
"response.completed": "response.created",
"response.output_item.done": "response.output_item.added",
"response.content_part.done": "response.content_part.added",
"response.output_text.done": "response.output_text.delta",
"response.web_search_call.done": "response.web_search_call.added",
"response.reasoning_text.done": "response.reasoning_text.delta",
"response.reasoning_part.done": "response.reasoning_part.added",
}
for prompt in prompts:
response = await client.responses.create(
model=model_name,
input=prompt,
reasoning={"effort": "low"},
tools=[],
stream=True,
background=False,
)
stack_of_event_types = []
async for event in response:
if event.type == 'response.created':
stack_of_event_types.append(event.type)
elif event.type == 'response.completed':
assert stack_of_event_types[-1] == pairs_of_event_types[
event.type]
stack_of_event_types.pop()
if event.type.endswith("added"):
stack_of_event_types.append(event.type)
elif event.type.endswith("delta"):
if stack_of_event_types[-1] == event.type:
continue
stack_of_event_types.append(event.type)
elif event.type.endswith("done"):
assert stack_of_event_types[-1] == pairs_of_event_types[
event.type]
stack_of_event_types.pop()
assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
assert event.item_id == current_item_id
# verify content_index_id is correct
if event.type == "response.content_part.added":
if event.type in [
"response.content_part.added",
"response.reasoning_part.added"
]:
assert event.content_index != current_content_index
current_content_index = event.content_index
elif event.type in [
@ -400,7 +454,13 @@ async def test_web_search(client: OpenAI, model_name: str):
async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="Multiply 64548*15151 using builtin python interpreter.",
# TODO: Ideally we would cap the number of tool calls to prevent
# multi-turn (it would also speed up the test), but that is not
# currently supported.
input=("What's the first 4 digits after the decimal point of "
"cube root of `19910212 * 20250910`? "
"Show only the digits. The python interpreter is not stateful "
"and you must print to see the output."),
tools=[{
"type": "code_interpreter",
"container": {
@ -410,6 +470,7 @@ async def test_code_interpreter(client: OpenAI, model_name: str):
)
assert response is not None
assert response.status == "completed"
assert response.usage.output_tokens_details.tool_output_tokens > 0
def get_weather(latitude, longitude):
@ -461,6 +522,8 @@ async def test_function_calling(client: OpenAI, model_name: str):
model=model_name,
input="What's the weather like in Paris today?",
tools=tools,
temperature=0.0,
extra_body={"request_id": "test_function_calling_non_resp"},
)
assert response is not None
assert response.status == "completed"
@ -689,3 +752,18 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
assert response_2 is not None
assert response_2.status == "completed"
assert response_2.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str,
server):
response = await client.responses.create(
model=model_name,
input="What is the capital of South Korea?",
extra_body={"enable_response_messages": True})
assert response is not None
assert response.status == "completed"
assert len(response.input_messages) > 0
assert len(response.output_messages) > 0
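The stack-based pairing check in test_streaming_types above can be read as a small standalone validator. A minimal sketch, assuming plain event-type strings as input (the helper name is illustrative and not part of the test suite):

```python
# Hypothetical helper mirroring the pairing logic in test_streaming_types:
# every "*.done"/"response.completed" event must close the block opened by
# the corresponding "*.added"/"*.delta"/"response.created" event.
PAIRS_OF_EVENT_TYPES = {
    "response.completed": "response.created",
    "response.output_item.done": "response.output_item.added",
    "response.content_part.done": "response.content_part.added",
    "response.output_text.done": "response.output_text.delta",
    "response.web_search_call.done": "response.web_search_call.added",
    "response.reasoning_text.done": "response.reasoning_text.delta",
    "response.reasoning_part.done": "response.reasoning_part.added",
}


def check_event_pairing(event_types: list[str]) -> None:
    stack: list[str] = []
    for event_type in event_types:
        if event_type == "response.created" or event_type.endswith("added"):
            stack.append(event_type)
        elif event_type.endswith("delta"):
            # Consecutive deltas of one type count as a single open block.
            if not stack or stack[-1] != event_type:
                stack.append(event_type)
        elif (event_type == "response.completed"
              or event_type.endswith("done")):
            assert stack and stack[-1] == PAIRS_OF_EVENT_TYPES[event_type]
            stack.pop()
    assert not stack, f"unclosed event blocks: {stack}"
```

Keeping both sides of each pair in one dictionary means a new event family only needs a single entry to be covered by the check.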

View File

@ -194,6 +194,7 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
assert tc.function is not None and tc.function.name == "get_current_weather"
args1 = tc.function.arguments
assert args1 is not None and len(args1) > 0
assert not first_msg.content
messages.append({"role": "assistant", "content": args1})
messages.append({

View File

@ -5,6 +5,11 @@ import json
import pytest
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
Hermes2ProToolParser)
from vllm.transformers_utils.tokenizer import AnyTokenizer
from ....utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@ -37,7 +42,7 @@ TOOLS = [{
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
@ -45,8 +50,39 @@ TOOLS = [{
},
}]
PRODUCT_TOOLS = [{
"type": "function",
"function": {
"name": "get_product_info",
"description": "Get detailed information of a product based on its "
"product ID.",
"parameters": {
"type": "object",
"properties": {
"inserted": {
"type": "boolean",
"description": "inserted.",
},
"product_id": {
"type": "integer",
"description": "The product ID of the product.",
},
},
"required": ["product_id", "inserted"],
},
},
}]
MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
PRODUCT_MESSAGES = [{
"role":
"user",
"content":
"Hi! Do you have any detailed information about the product id "
"7355608 and inserted true?",
}]
@pytest.mark.asyncio
async def test_non_streaming_tool_call():
@ -113,8 +149,8 @@ async def test_streaming_tool_call():
if tool_chunk.function.name:
tool_call_chunks[index]["name"] += tool_chunk.function.name
if tool_chunk.function.arguments:
tool_call_chunks[index][
"arguments"] += tool_chunk.function.arguments
tool_call_chunks[index]["arguments"] += (
tool_chunk.function.arguments)
assert len(tool_call_chunks) == 1
reconstructed_tool_call = tool_call_chunks[0]
@ -127,3 +163,295 @@ async def test_streaming_tool_call():
print("\n[Streaming Test Passed]")
print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
print(f"Reconstructed Arguments: {arguments}")
@pytest.mark.asyncio
async def test_non_streaming_product_tool_call():
"""Test tool call integer and boolean parameters in non-streaming mode."""
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
client = server.get_async_client()
response = await client.chat.completions.create(
model=LORA_MODEL,
messages=PRODUCT_MESSAGES,
tools=PRODUCT_TOOLS,
tool_choice="auto",
temperature=0.66,
)
assert response.choices
choice = response.choices[0]
message = choice.message
assert choice.finish_reason == "tool_calls"
assert message.tool_calls is not None
tool_call = message.tool_calls[0]
assert tool_call.type == "function"
assert tool_call.function.name == "get_product_info"
arguments = json.loads(tool_call.function.arguments)
assert "product_id" in arguments
assert "inserted" in arguments
product_id = arguments.get("product_id")
inserted = arguments.get("inserted")
assert isinstance(product_id, int)
assert product_id == 7355608
assert isinstance(inserted, bool)
assert inserted is True
print("\n[Non-Streaming Product Test Passed]")
print(f"Tool Call: {tool_call.function.name}")
print(f"Arguments: {arguments}")
@pytest.mark.asyncio
async def test_streaming_product_tool_call():
"""Test tool call integer and boolean parameters in streaming mode."""
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
client = server.get_async_client()
stream = await client.chat.completions.create(
model=LORA_MODEL,
messages=PRODUCT_MESSAGES,
tools=PRODUCT_TOOLS,
tool_choice="auto",
temperature=0.66,
stream=True,
)
tool_call_chunks = {}
async for chunk in stream:
if not chunk.choices:
continue
delta = chunk.choices[0].delta
if not delta or not delta.tool_calls:
continue
for tool_chunk in delta.tool_calls:
index = tool_chunk.index
if index not in tool_call_chunks:
tool_call_chunks[index] = {"name": "", "arguments": ""}
if tool_chunk.function.name:
tool_call_chunks[index]["name"] += tool_chunk.function.name
if tool_chunk.function.arguments:
tool_call_chunks[index]["arguments"] += (
tool_chunk.function.arguments)
assert len(tool_call_chunks) == 1
reconstructed_tool_call = tool_call_chunks[0]
assert reconstructed_tool_call["name"] == "get_product_info"
arguments = json.loads(reconstructed_tool_call["arguments"])
assert "product_id" in arguments
assert "inserted" in arguments
# Verify the parsed argument types for the streaming test as well
product_id = arguments.get("product_id")
inserted = arguments.get("inserted")
assert isinstance(product_id, int)
assert product_id == 7355608
assert isinstance(inserted, bool)
assert inserted is True
print("\n[Streaming Product Test Passed]")
print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
print(f"Reconstructed Arguments: {arguments}")
@pytest.fixture
def qwen_tokenizer() -> AnyTokenizer:
from vllm.transformers_utils.tokenizer import get_tokenizer
return get_tokenizer("Qwen/Qwen3-32B")
@pytest.fixture
def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser:
return Hermes2ProToolParser(qwen_tokenizer)
@pytest.fixture
def any_chat_request() -> ChatCompletionRequest:
return ChatCompletionRequest(
seed=42,
model="Qwen/Qwen3-32B",
messages=[],
)
def test_hermes_parser_streaming_just_forward_text(
qwen_tokenizer: AnyTokenizer,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = (
"""This is some prior text that has nothing to do with tool calling."""
)
tokens = qwen_tokenizer.encode(text)
previous_text = ""
delta_messages = []
for token in tokens:
delta_text = qwen_tokenizer.decode([token])
current_text = previous_text + delta_text
delta = hermes_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=delta_text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=any_chat_request,
)
previous_text = current_text
delta_messages.append(delta)
for delta in delta_messages:
assert delta is not None
assert not delta.tool_calls
print(delta_messages)
assert "".join([delta.content for delta in delta_messages]) == text
def test_hermes_parser_streaming_failure_case_bug_19056(
qwen_tokenizer: AnyTokenizer,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
tokens = qwen_tokenizer.encode(text)
previous_text = ""
delta_messages = []
for token in tokens:
text = qwen_tokenizer.decode([token])
current_text = previous_text + text
delta = hermes_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=any_chat_request,
)
previous_text = current_text
if delta is not None:
delta_messages.append(delta)
assert delta_messages[0].tool_calls[0].function.name == "final_answer"
tool_call_args = "".join(delta.tool_calls[0].function.arguments or ""
for delta in delta_messages)
assert tool_call_args == '{"trigger": true}'
def test_hermes_parser_streaming(
qwen_tokenizer: AnyTokenizer,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = '<tool_call>\
{"name": "get_current_temperature",\
"arguments": {"location":\
"San Francisco, California, United States", "unit": "celsius"}}\
</tool_call>'
tokens = qwen_tokenizer.encode(text)
previous_text = ""
delta_messages = []
for token in tokens:
text = qwen_tokenizer.decode([token])
current_text = previous_text + text
delta = hermes_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=any_chat_request,
)
previous_text = current_text
if delta is not None:
delta_messages.append(delta)
print(delta_messages)
assert (delta_messages[0].tool_calls[0].function.name ==
"get_current_temperature")
tool_call_args = "".join(delta.tool_calls[0].function.arguments or ""
for delta in delta_messages)
assert tool_call_args == (
'{"location":"San Francisco, California, United States", '
'"unit": "celsius"}')
def test_hermes_parser_non_streaming_no_tool_call(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """This is not a tool call."""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)
assert tool_call is not None
assert not tool_call.tools_called
def test_hermes_parser_non_streaming_tool_call_between_tags(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)
assert tool_call is not None
assert tool_call.tools_called
assert tool_call.tool_calls[0].function.name == "final_answer"
assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'
def test_hermes_parser_non_streaming_tool_call_until_eos(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}"""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)
assert tool_call is not None
assert tool_call.tools_called
assert tool_call.tool_calls[0].function.name == "final_answer"
assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'
def test_hermes_parser_non_streaming_tool_call_invalid_json(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
# Missing closing brace to trigger exception
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}"""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)
assert tool_call is not None
assert not tool_call.tools_called
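The three streaming Hermes tests above all drive the parser the same way: decode one token at a time and call extract_tool_calls_streaming with the growing text. A minimal sketch of that loop as a reusable helper (the helper itself is hypothetical; the imports and the parser call signature are the ones used in the tests above):

```python
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
    Hermes2ProToolParser)
from vllm.transformers_utils.tokenizer import AnyTokenizer


def collect_streaming_deltas(tokenizer: AnyTokenizer,
                             parser: Hermes2ProToolParser,
                             request: ChatCompletionRequest,
                             text: str) -> list:
    """Feed `text` to the parser token by token and collect non-None deltas."""
    previous_text = ""
    deltas = []
    for token in tokenizer.encode(text):
        delta_text = tokenizer.decode([token])
        current_text = previous_text + delta_text
        delta = parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=request,
        )
        previous_text = current_text
        if delta is not None:
            deltas.append(delta)
    return deltas
```

With such a helper, the assertions reduce to checks on the returned delta list, e.g. joining the function.arguments fragments and comparing against the expected JSON.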

View File

@ -216,7 +216,7 @@ def server_with_chunked_processing():
"--enforce-eager",
"--max-model-len",
"512", # Set smaller max_model_len to trigger chunking mechanism
'--override-pooler-config',
'--pooler-config',
('{"pooling_type": "MEAN", "normalize": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}'),
"--gpu-memory-utilization",

View File

@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
global WORKER_RUNTIME_SECONDS
WORKER_RUNTIME_SECONDS = 0.5
# Copy the args to avoid mutating the
# Copy the args to avoid mutating them
args = api_server_args.copy()
if not with_stats_update:

View File

@ -19,7 +19,7 @@ pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
# Run evaluation
python tests/gsm8k/gsm8k_eval.py --port 8000
python tests/evals/gsm8k/gsm8k_eval.py --port 8000
```
## Configuration Format

View File

@ -18,7 +18,7 @@ if not current_platform.is_rocm():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from vllm.attention.backends.xformers import _make_alibi_bias
from tests.kernels.utils import make_alibi_bias
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
@ -429,8 +429,8 @@ def test_multi_query_kv_attention(
alibi_bias = None
if use_alibi:
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
seq_lens)
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
seq_lens)
output = torch.empty_like(query)
start = 0
# Dynamic sequence length not supported with custom attn_bias.

View File

@ -69,32 +69,23 @@ def generate_params():
@pytest.mark.parametrize("device, name, use_mla, block_size",
generate_params())
@pytest.mark.parametrize("use_v1", [True, False])
def test_env(
device: str,
name: str,
use_mla: bool,
block_size: int,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
if name == "FLASHINFER" and not use_v1:
pytest.skip("FlashInfer backend is only available on V1 engine")
if device == "cpu":
if not use_v1:
pytest.skip("CPU backend only supports V1")
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size,
False)
backend = get_attn_backend(16, torch.float16, None, block_size)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
elif device == "hip":
@ -114,7 +105,6 @@ def test_env(
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
assert f"The selected backend, {name}" in str(
exc_info.value)
@ -125,7 +115,6 @@ def test_env(
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
assert f"The selected backend, {name}" in str(
exc_info.value)
@ -135,18 +124,16 @@ def test_env(
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = f"{name}_VLLM_V1" if use_v1 else name
expected = f"{name}_VLLM_V1"
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
expected = "TRITON_ATTN_VLLM_V1"
assert backend.get_name() == expected
elif device == "cuda":
@ -163,11 +150,7 @@ def test_env(
# - TRITON_MLA: fallback for other cases
if name == "CUTLASS_MLA":
if not use_v1:
# CUTLASS_MLA only supported on V1 engine
pytest.skip(
"CUTLASS_MLA only supported on V1 engine")
elif block_size != 128:
if block_size != 128:
# CUTLASS_MLA only supports block_size == 128
pytest.skip(
"CUTLASS_MLA only supports block_size 128")
@ -176,16 +159,11 @@ def test_env(
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "CUTLASS_MLA_VLLM_V1"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if not use_v1:
# FlashInfer MLA only supported on V1 engine
pytest.skip(
"FlashInfer MLA only supported on V1 engine")
elif block_size not in [32, 64]:
if block_size not in [32, 64]:
# FlashInfer MLA only supports block_size 32 or 64
pytest.skip(
"FlashInfer MLA only supports block_size 32 "
@ -195,7 +173,6 @@ def test_env(
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "FLASHINFER_MLA"
assert backend.get_name() == expected
@ -204,7 +181,7 @@ def test_env(
# FlashMLA only supports block_size == 64
pytest.skip("FlashMLA only supports block_size 64")
else:
from vllm.attention.backends.flashmla import (
from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501
is_flashmla_supported)
is_supported, _ = is_flashmla_supported()
if not is_supported:
@ -215,93 +192,73 @@ def test_env(
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = f"{name}_VLLM_V1" if use_v1 else name
expected = f"{name}_VLLM_V1"
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
if not use_v1:
# FlashAttention MLA only supported on V1 engine
pytest.skip(
"FlashAttention MLA only supported on V1 engine"
)
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "FLASH_ATTN_MLA"
assert backend.get_name() == expected
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
expected = "FLASH_ATTN_MLA"
assert backend.get_name() == expected
else:
# TRITON_MLA or other fallback
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = ("TRITON_MLA_VLLM_V1"
if use_v1 else "TRITON_MLA")
expected = "TRITON_MLA_VLLM_V1"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
expected = "FLASHINFER_VLLM_V1"
assert backend.get_name() == expected
else:
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
expected = "FLASH_ATTN_VLLM_V1"
assert backend.get_name() == expected
if use_v1:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv("VLLM_USE_V1", "1")
if device == "cpu":
if not use_v1:
pytest.skip("CPU backend only supports V1")
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16, False)
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
@ -316,29 +273,29 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setattr(torch.cuda,
"get_device_capability",
lambda _=None: (7, 5))
backend = get_attn_backend(16, torch.float16, None, 16, False)
backend = get_attn_backend(16, torch.float16, None, 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Reset the monkeypatch for subsequent tests
monkeypatch.undo()
# Unsupported data type
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Unsupported kv cache data type
backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
backend = get_attn_backend(16, torch.float16, "fp8", 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Unsupported block size
backend = get_attn_backend(16, torch.float16, None, 8, False)
backend = get_attn_backend(16, torch.float16, None, 8)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# flash-attn is not installed
import sys
original_module = sys.modules.get('vllm_flash_attn')
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
backend = get_attn_backend(16, torch.float16, None, 16, False)
backend = get_attn_backend(16, torch.float16, None, 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Restore the original module if it existed
@ -349,23 +306,18 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
# Unsupported head size
backend = get_attn_backend(17, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Attention-free models should bypass env and use PlaceholderAttention
backend = get_attn_backend(16, torch.float16, None, 16, True)
backend = get_attn_backend(17, torch.float16, None, 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
@pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
"""Test that invalid attention backend names raise ValueError."""
with monkeypatch.context() as m, patch(
"vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
# Should raise ValueError for invalid backend
with pytest.raises(ValueError) as exc_info:
get_attn_backend(32, torch.float16, None, 16, False)
get_attn_backend(32, torch.float16, None, 16)
assert "Invalid value 'INVALID'" in str(exc_info.value)

View File

@ -39,6 +39,8 @@ CUDA_DEVICES = [
# We assume fp8 is always enabled for testing.
KV_CACHE_DTYPE = ["auto", "fp8"]
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@ -223,6 +225,7 @@ def test_reshape_and_cache(
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@pytest.mark.parametrize("kv_cache_layout", CACHE_LAYOUTS)
@pytest.mark.parametrize("implementation", RESHAPE_FLASH_IMPLEMENTATIONS)
@torch.inference_mode()
def test_reshape_and_cache_flash(
kv_cache_factory_flashinfer,
@ -236,9 +239,13 @@ def test_reshape_and_cache_flash(
device: str,
kv_cache_dtype: str,
kv_cache_layout: str,
implementation: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
assert implementation in ["cuda", "triton"]
if implementation == "triton" and kv_cache_layout == "HND":
pytest.skip("Triton implementation only supports NHD layout.")
# fp8 conversion requires a contiguous memory buffer. Reduce the number of
# blocks and tokens to consume less memory.
@ -298,12 +305,20 @@ def test_reshape_and_cache_flash(
cloned_key_cache = key_cache_compact.clone()
cloned_value_cache = value_cache_compact.clone()
# Call the reshape_and_cache kernel.
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
k_scale, v_scale),
cond=(head_size == HEAD_SIZES[0]))
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, k_scale, v_scale)
if implementation == "cuda":
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
(key, value, key_cache, value_cache, slot_mapping,
kv_cache_dtype, k_scale, v_scale),
cond=(head_size == HEAD_SIZES[0]))
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, k_scale,
v_scale)
elif implementation == "triton":
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash)
triton_reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, k_scale,
v_scale)
key_cache_compact = permute_and_compact(key_cache)
value_cache_compact = permute_and_compact(value_cache)
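The new implementation parameter above selects between the CUDA custom op and the Triton kernel with identical arguments. A minimal sketch of that dispatch on its own, assuming the ops alias refers to vllm._custom_ops as in the kernel tests (both call paths are copied from the hunk, and the Triton path only supports the NHD cache layout):

```python
from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
    triton_reshape_and_cache_flash)


def reshape_and_cache_flash_dispatch(implementation, key, value, key_cache,
                                     value_cache, slot_mapping,
                                     kv_cache_dtype, k_scale, v_scale):
    # Same arguments either way; only the kernel implementation differs.
    if implementation == "cuda":
        ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
                                    slot_mapping, kv_cache_dtype, k_scale,
                                    v_scale)
    elif implementation == "triton":
        # Note: the Triton kernel only supports the NHD layout.
        triton_reshape_and_cache_flash(key, value, key_cache, value_cache,
                                       slot_mapping, kv_cache_dtype, k_scale,
                                       v_scale)
    else:
        raise ValueError(f"unknown implementation: {implementation}")
```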

View File

@ -11,7 +11,7 @@ import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
from vllm.attention.backends.xformers import _make_alibi_bias
from tests.kernels.utils import make_alibi_bias
from vllm.attention.ops.chunked_prefill_paged_decode import (
chunked_prefill_paged_decode)
from vllm.attention.ops.prefix_prefill import context_attention_fwd
@ -470,7 +470,7 @@ def test_contexted_kv_attention_alibi(
key = key.unsqueeze(0)
value = value.unsqueeze(0)
attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
output_ref = torch.empty_like(output)
seq_start = 0
query_start = 0
@ -479,7 +479,7 @@ def test_contexted_kv_attention_alibi(
# FIXME(DefTruth): Because xformers does not support dynamic sequence
# lengths with custom attention bias, we process each prompt one by
# one. This is inefficient, especially when we have many short prompts.
# modified from: vllm/attention/backends/xformers.py#L343
# modified from: vllm/v1/attention/backends/xformers.py#L343
for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
seq_end = seq_start + seq_len
query_end = query_start + query_len

View File

@ -16,6 +16,7 @@ def clear_cache():
_cached_get_attn_backend.cache_clear()
@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
def test_selector(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")

View File

@ -83,7 +83,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("sliding_window", [None, 64, 128, 256])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)

Some files were not shown because too many files have changed in this diff.