Compare commits


154 Commits

Author SHA1 Message Date
bcf3c8230d Merge branch 'main' into woosuk-jf 2025-05-04 11:16:07 -07:00
2858830c39 [Bugfix] Prioritize dtype in root config before checking text config (#17629)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-04 12:43:05 +00:00
d6484ef3c3 Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-03 19:42:43 -07:00
46fae69cf0 [Misc] V0 fallback for --enable-prompt-embeds (#17615)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-03 22:59:24 +00:00
a01af39aa8 Merge branch 'main' into woosuk-jf 2025-05-03 10:42:43 -07:00
f66f1e0fa3 [Bugfix] Fix broken Qwen2.5-omni tests (#17613)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-03 17:08:14 +00:00
887d7af882 [Core] Gate prompt_embeds behind a feature flag (#17607)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-04 00:19:20 +08:00
a92842454c [Bugfix][ROCm] Using device_type because on ROCm the API is still torch.cuda (#17601)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-02 22:25:47 -07:00
c8386fa61d [Build/CI] Upgrade CUTLASS to 3.9.1 (#17602)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-05-02 22:25:14 -07:00
87baebebd8 [Frontend][TPU] Add TPU default max-num-batched-tokens based on device name (#17508)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-02 21:42:44 -07:00
e3d0a1d190 [Quantization] [AMD] Add support for running DeepSeek int8 w8a8 MoE on ROCm (#17558)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-05-02 21:41:10 -07:00
d47b605eca Update test requirements to CUDA 12.8 (#17576)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-05-02 21:40:15 -07:00
22c6f6397f [Neuron][Build] Require setuptools >= 77.0.3 for PEP 639 (#17603)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-05-03 02:41:59 +00:00
3ec97e2cc5 [release] Add command to clean up Docker containers/images in TPU release machine (#17606) 2025-05-02 18:54:34 -07:00
9b103a1d76 fix typo in logging (#17605) 2025-05-02 18:04:40 -07:00
b90b0852e9 [easy] Print number of needed GPUs in skip message (#17594)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-02 15:27:43 -07:00
9352cdb56d [Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#16263)
Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Lu Fang <lufang@fb.com>
2025-05-02 19:44:19 +00:00
182f40ea8b Add NVIDIA TensorRT Model Optimizer in vLLM documentation (#17561) 2025-05-02 11:36:46 -07:00
3e887d2e0c permute/unpermute kernel for moe optimization (#14568)
Signed-off-by: Caleb_Du <Caleb_Du@zju.edu.cn>
2025-05-02 11:31:55 -07:00
0f87d8f7b2 [BugFix][Attention] Fix sliding window attention in V1 giving incorrect results (#17574)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-02 11:01:38 -07:00
4c33d67321 [Bugfix] fix tmp_out and exp_sums dimensions (#17438)
Signed-off-by: Hui Liu <96135754+hliuca@users.noreply.github.com>
2025-05-02 16:44:07 +00:00
cb234955df [Misc] Clean up input processing (#17582)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 08:11:53 -07:00
3a500cd0b6 [doc] miss result (#17589)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-02 07:04:49 -07:00
868c546da4 Support W8A8 INT8 MoE for compressed-tensors (#16745)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 10:03:32 -04:00
99404f53c7 [Security] Fix image hash collision (#17378)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 08:36:39 -04:00
785d75a03b Automatically tell users that dict args must be valid JSON in CLI (#17577)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-02 05:24:55 -07:00
6d1479ca4b [doc] add the print result (#17584)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-02 05:24:45 -07:00
b8b0859b5c add more pytorch related tests for torch nightly (#17422)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-05-02 03:29:59 -07:00
d7543862bd [Misc] Rename assets for testing (#17575)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 03:29:25 -07:00
c777df79f7 [BugFix] Fix Memory Leak (#17567)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-05-02 01:07:03 -07:00
cc2a77d7f1 [Core] [Bugfix] Add Input Embeddings (#15428)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: 临景 <linjing.yx@alibaba-inc.com>
Co-authored-by: Bryce1010 <bryceyx@gmail.com>
Co-authored-by: Nan2018 <nan@protopia.ai>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 01:06:39 -07:00
9e2de9b9e9 [Bugfix] Remove TritonPlaceholder from sys.modules (#17317)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-02 00:45:01 -07:00
109e15a335 Add pt_load_map_location to allow loading to cuda (#16869)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-05-01 23:23:42 -07:00
f192ca90e6 Fix PixtralHF missing spatial_merge_size (#17571)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-01 22:14:09 -07:00
f89d0e11bf [Misc] Continue refactoring model tests (#17573)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 22:06:08 -07:00
b4003d11fc Check if bitblas is installed during support check (#17572)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 04:32:54 +00:00
292fc59d61 [CI] Actually run tests/kv_transfer/test_disagg.py in CI (#17555)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 04:05:04 +00:00
afcb3f8863 [Attention] MLA move o_proj q_proj into cuda-graph region (#17484)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-02 03:16:26 +00:00
afb12e4294 [Doc] note that not all unit tests pass on CPU platforms (#17554)
Signed-off-by: David Xia <david@davidxia.com>
2025-05-02 02:57:21 +00:00
eeb5761cf1 Implement Jump-Forward (Fast-Forward) Decoding
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-01 18:08:52 -07:00
24aebae177 [Bugfix] Disable gptq_bitblas for <SM80 to fix GPTQ on V100/T4 (#17541)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-01 17:59:35 -07:00
39c0813a7f [V1][Spec Decode] Apply torch.compile & cudagraph to EAGLE3 (#17504)
Signed-off-by: qizixi <qizixi@meta.com>
2025-05-01 16:19:30 -07:00
9b70e2b4c1 [Misc][Tools][Benchmark] Publish script to auto tune server parameters (#17207)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-01 19:53:03 +00:00
173daac19d [Bug]change the position of cuda_graph_sizes in dataclasses (#17548)
Signed-off-by: CXIAAAAA <cxia0209@gmail.com>
2025-05-01 11:52:37 -07:00
04f2cfc894 Remove duplicate code from dbrx.py (#17550) 2025-05-01 11:51:58 -07:00
811a6c0972 [ROCM] Add gfx950 to the custom attention archs (#16034)
Signed-off-by: jpvillam <Juan.Villamizar@amd.com>
Signed-off-by: seungrokjung <seungrok.jung@amd.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: seungrokjung <seungrok.jung@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-01 11:18:28 -07:00
9b1769dd9a [Bugfix] Fix lint error (#17547)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 11:12:19 -07:00
61c299f81f [Misc]add configurable cuda graph size (#17201)
Signed-off-by: CXIAAAAA <cxia0209@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-01 11:04:50 -07:00
4acfa3354a [ROCm] update installation guide to include build aiter from source instructions (#17542)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-01 11:01:28 -07:00
88c8304104 [Model] Refactor Ovis2 to support original tokenizer (#17537)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-01 11:00:53 -07:00
6768ff4a22 Move the last arguments in arg_utils.py to be in their final groups (#17531)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-01 10:31:44 -07:00
f2e7af9b86 [CI/Build] Remove awscli dependency (#17532)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 09:20:54 -07:00
7423cf0a9b [Misc] refactor example - cpu_offload_lmcache (#17460)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-01 15:05:24 +00:00
460a2b1100 [torch.compile] Add torch inductor pass for fusing silu_and_mul with subsequent scaled_fp8_quant operations (#10867)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-05-01 07:59:28 -07:00
28566d73b3 [ROCm] remove unsupported archs from rocm triton flash-attention supported list (#17536)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2025-05-01 07:54:25 -07:00
98060b001d [Feature][Frontend]: Deprecate --enable-reasoning (#17452)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-01 06:46:16 -07:00
f5a3c655b2 [FEAT] [ROCm]: Add Qwen/Qwen3-235B-A22B-FP8 TP4 triton fused moe config (#17535)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-01 06:37:17 -07:00
7169f87ad0 [doc] add streamlit integration (#17522)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-01 13:34:02 +00:00
b74d888c63 Fix more broken speculative decode tests (#17450)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-05-01 06:05:58 -07:00
2007d4d54f [FEAT] [ROCm]: Add Qwen/Qwen3-30B-A3B-FP8 fused moe config for MI300X (#17530)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-01 06:03:13 -07:00
48e925fab5 [Misc] Clean up test docstrings and names (#17521)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 05:19:32 -07:00
1903c0b8a3 [Frontend] Show progress bar for adding requests (#17525)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 05:15:32 -07:00
86a1f67a3b [Bugfix][Benchmarks] Allow benchmark of deepspeed-mii backend to select a model (#17285)
Signed-off-by: Teruaki Ishizaki <teruaki.ishizaki@ntt.com>
2025-05-01 11:54:51 +00:00
a257d9bccc Improve configs - ObservabilityConfig (#17453)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-01 03:52:05 -07:00
015069b017 [Misc] Optimize the Qwen3_ReasoningParser extract_reasoning_content (#17515)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-01 03:29:01 -07:00
fbefc8a78d [Core] Enable IPv6 with vllm.utils.make_zmq_socket() (#16506)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-01 09:38:18 +00:00
26bc4bbcd8 Avoid overwriting vllm_compile_cache.py (#17418)
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
2025-05-01 07:30:57 +00:00
3c3d767201 [BugFix] Fix mla cpu - missing 3 required positional arguments (#17494)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-01 14:36:52 +08:00
13cf6b6236 [BugFix] fix speculative decoding memory leak when speculation is disabled (#15506)
Signed-off-by: Noah Yoshida <noahcy117@gmail.com>
2025-04-30 23:28:17 -07:00
90d0a54c4d [ROCm] Effort to reduce the number of environment variables in command line (#17229)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2025-04-30 23:27:06 -07:00
7a0a146c54 [Build] Require setuptools >= 77.0.3 for PEP 639 (#17389)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-30 23:25:36 -07:00
7ab643e425 Fixing the AMD test failures caused by PR#16457 (#17511)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-30 23:23:07 -07:00
afb4429b4f [CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-30 23:03:08 -07:00
aa4502e7f3 [CI][Bugfix] Fix failing V1 Test due to missing 'cache_salt' arg (#17500)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-30 21:03:30 -07:00
17b4d85f63 [CI][TPU] Skip structured outputs+spec decode tests on TPU (#17510)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-30 20:36:20 -07:00
1144a8efe7 [Bugfix] Temporarily disable gptq_bitblas on ROCm (#17411)
Signed-off-by: Yan Cangang <nalanzeyu@gmail.com>
2025-04-30 19:51:45 -07:00
08fb5587b4 [Bugfix][ROCm] Fix import error on ROCm (#17495)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-30 19:51:42 -07:00
dbc18e7816 [CI][TPU] Skip Multimodal test (#17488)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-04-30 19:51:39 -07:00
02bd654846 [Misc] Rename Audios -> Audio in Qwen2audio Processing (#17507)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-30 19:51:36 -07:00
200bbf92e8 Bump Compressed Tensors version to 0.9.4 (#17478)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-30 15:24:45 -07:00
81ecf425f0 [v1][Spec Decode] Make sliding window compatible with eagle prefix caching (#17398)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-30 18:25:53 +00:00
42d9a2c4c7 doc: fix bug report Github template formatting (#17486)
Signed-off-by: David Xia <david@davidxia.com>
2025-04-30 10:03:20 -07:00
2ac74d098e [doc] add install tips (#17373)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-30 17:02:41 +00:00
584f5fb4c6 [Bugfix][ROCm] Restrict ray version due to a breaking release (#17480)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-30 09:59:06 -07:00
d586ddc691 [BugFix] Fix authorization of openai_transcription_client.py (#17321)
Signed-off-by: zh Wang <rekind133@outlook.com>
2025-04-30 09:51:05 -07:00
0b7e701dd4 [Docs] Update optimization.md doc (#17482)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-30 09:34:02 -07:00
947f2f5375 [V1] Allow turning off pickle fallback in vllm.v1.serial_utils (#17427)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-30 16:10:54 +00:00
739e03b344 [Bugfix] Fixed mistral tokenizer path when pointing to file (#17457)
Signed-off-by: Pete Savage <psavage@redhat.com>
2025-04-30 08:08:37 -07:00
da4e7687b5 [Fix] Support passing args to logger (#17425)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-04-30 08:06:58 -07:00
39317cf42b [Docs] Add command for running mypy tests from CI (#17475)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-30 08:06:09 -07:00
2990cee95b [Feature] The Qwen3 reasoning parser supports guided decoding (#17466)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-30 07:48:21 -07:00
0be6d05b5e [V1][Metrics] add support for kv event publishing (#16750)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
2025-04-30 07:44:45 -07:00
77073c77bc [Core] Prevent side-channel attacks via cache salting (#17045)
Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
2025-04-30 20:27:21 +08:00
a7d5b016bd [TPU][V1][CI] Update regression test baseline for v6 CI (#17064)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-30 04:03:22 -07:00
d803786731 [V1][Bugfix]: vllm v1 version metric num_gpu_blocks is None (#15755)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-30 18:20:39 +08:00
1534d389af [Misc] Remove deprecated files (#17447)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-30 01:52:19 -07:00
ece5a8b0b6 Make the _apply_rotary_emb compatible with dynamo (#17435) 2025-04-30 07:52:48 +00:00
54072f315f [MODEL ADDITION] Ovis2 Model Addition (#15826)
Signed-off-by: Marco <121761685+mlinmg@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-04-30 07:33:29 +00:00
be633fba0f [Bugfix] Fix AttributeError: 'State' object has no attribute 'engine_client' (#17434)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-30 00:11:04 -07:00
ed6cfb90c8 [Hardware][Intel GPU] Upgrade to torch 2.7 (#17444)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Qiming Zhang <qiming1.zhang@intel.com>
2025-04-30 00:03:58 -07:00
6ed9f6047e [Intel GPU] [CI]Fix XPU ci, setuptools >=80.0 have build issue (#17298)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-04-29 22:54:10 -07:00
a44c4f1d2f Support LoRA for Mistral3 (#17428)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-29 21:10:30 -07:00
88fcf00dda Fix some speculative decode tests with tl.dot (#17371)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-04-29 19:41:02 -07:00
d1f569b1b9 Fix call to logger.info_once (#17416)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 19:39:18 -07:00
13698db634 Improve configs - ModelConfig (#17130)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-30 10:38:22 +08:00
2c4f59afc3 Update PyTorch to 2.7.0 (#16859) 2025-04-29 19:08:04 -07:00
1c2bc7ead0 Truncation control for embedding models (#14776)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-04-30 09:24:57 +08:00
4055130a85 [release] Always git fetch all to get latest tag on TPU release (#17322) 2025-04-29 17:52:11 -07:00
34120f5acd [V1][Feature] Enable Speculative Decoding with Structured Outputs (#14702)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-04-30 00:02:10 +00:00
7489ec0bab Remove Bamba 9B from CI (#17407)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 21:10:31 +00:00
70788bdbdc [V1][Spec Decode] Apply torch.compile & cudagraph to EAGLE (#17211)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
2025-04-29 21:10:00 +00:00
c9c1b59e59 Fix: Python package installation for opentelemetry (#17049)
Signed-off-by: Dilip Gowda Bhagavan <dilip.bhagavan@ibm.com>
2025-04-29 20:20:24 +00:00
0350809f3a Remove Falcon3 2x7B from CI (#17404)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 19:52:25 +00:00
a6977dbd15 Simplify (and fix) passing of guided decoding backend options (#17008)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 19:02:23 +00:00
2fa2a50bf9 [Bugfix] Fix Minicpm-O-int4 GPTQ model inference (#17397)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-29 18:21:42 +00:00
08e15defa9 [CI/Build] Add retry mechanism for add-apt-repository (#17107)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-29 10:40:52 -07:00
b37685afbb [CI] Uses Python 3.11 for TPU (#17359)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-04-29 17:39:16 +00:00
792595b59d [TPU][V1][CI] Replace python3 setup.py develop with standard pip install --e on TPU (#17374)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-29 10:36:48 -07:00
0c1c788312 [Doc][Typo] Fixing label in new model requests link in overview.md (#17400) 2025-04-29 10:29:48 -07:00
56d64fbe30 [Docs] Propose a deprecation policy for the project (#17063)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-29 10:29:44 -07:00
608968b7c5 Enabling multi-group kernel tests. (#17115)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-29 10:27:27 -07:00
06ffc7e1d3 [Misc][ROCm] Exclude cutlass_mla_decode for ROCm build (#17289)
Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2025-04-29 10:26:42 -07:00
d3cf61b89b fix gemma3 results all zero (#17364)
Signed-off-by: mayuyuace <qiming1.zhang@intel.com>
2025-04-29 09:40:25 -07:00
a39203f99e [Bugfix] add qwen3 reasoning-parser fix content is None when disable … (#17369)
Signed-off-by: mofanke <mofanke@gmail.com>
2025-04-29 16:32:40 +00:00
24e6ad3f16 [V1] Remove num_input_tokens from attn_metadata (#17193)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-29 09:28:41 -07:00
2ef5d106bb Improve literal dataclass field conversion to argparse argument (#17391)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 16:25:08 +00:00
0ed27ef66c Fix: Spelling of inference (#17387) 2025-04-29 09:23:39 -07:00
900edfa8d4 Transformers backend tweaks (#17365)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 09:08:03 -07:00
88ad9ec6b2 [Frontend] Support chat_template_kwargs in LLM.chat (#17356)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 22:03:35 +08:00
40896bdf3f pre-commit autoupdate (#17380)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 06:46:55 -07:00
00ee37efa2 [Bugfix] Clean up MiniMax-VL and fix processing (#17354)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 20:42:16 +08:00
890f104cdf [Doc] Fix QWen3MOE info (#17381)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-29 12:38:32 +00:00
4a5e13149a Update docs requirements (#17379)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 11:35:47 +00:00
97cc8729f0 [Model] Ignore rotary embed load for Cohere model (#17319) 2025-04-29 00:30:40 -07:00
4464109219 [Build][Bugfix] Restrict setuptools version to <80 (#17320)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-29 00:17:23 -07:00
193e78e35d [Fix] Documentation spacing in compilation config help text (#17342)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-04-29 00:16:17 -07:00
bdb2cddafc [Misc]Use a platform independent interface to obtain the device attributes (#17100) 2025-04-29 06:59:13 +00:00
ebb3930d28 [Misc] Move config fields to MultiModalConfig (#17343)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 06:37:21 +00:00
cde384cd92 [Model] support MiniMax-VL-01 model (#16328)
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-04-29 12:05:50 +08:00
96e06e3cb7 [Misc] Add a Jinja template to support Mistral3 function calling (#17195)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-28 19:53:44 -07:00
17eb306fcc [Bugfix] Add contiguous call inside rope kernel wrapper (#17091)
Signed-off-by: 苏政渊 <suzhengyuan@moonshot.cn>
Co-authored-by: 苏政渊 <suzhengyuan@moonshot.cn>
2025-04-28 19:24:07 -07:00
165cb56329 Ignore '<string>' filepath (#17330)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-28 19:23:29 -07:00
d6da8a8ff2 [Bugfix] Fix numel() downcast in fused_layernorm_dynamic_per_token_quant.cu (#17316) 2025-04-28 19:23:18 -07:00
b4ac4fa04d [model] make llama4 compatible with pure dense layers (#17315)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-04-29 10:22:22 +08:00
e136000595 [V1][Spec Decode] Make Eagle model arch config driven (#17323) 2025-04-29 10:22:02 +08:00
86d9fc29cb implement Structural Tag with Guidance backend (#17333)
Signed-off-by: Michal Moskal <michal@moskal.me>
2025-04-29 02:21:32 +00:00
506475de5f [Optim] Compute multimodal hash only once per item (#17314)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 09:40:35 +08:00
cfe4532093 [Benchmark] Add single turn MTBench to Serving Bench (#17202) 2025-04-28 16:46:15 -07:00
8fc88d63f1 [Model] Add tuned triton fused_moe configs for Qwen3Moe (#17328)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-28 15:20:24 -07:00
6e74fd4945 Support loading transformers models with named parameters (#16868)
Signed-off-by: Alex <alexwu@character.ai>
2025-04-28 23:15:58 +01:00
dcbac4cb4b [Model] Qwen3 Dense FP8 Compat Fixes (#17318)
Signed-off-by: simon-mo <xmo@berkeley.edu>
2025-04-28 14:12:01 -07:00
ed2462030f [Bugfix] Fix moe weight losing all extra attrs after process_weights_after_loading. (#16854)
Signed-off-by: charlifu <charlifu@amd.com>
2025-04-28 21:05:07 +00:00
cc5befbced [BugFix] Fix cascade attention - RuntimeError: scheduler_metadata must have shape (metadata_size) (#17283)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-28 13:55:50 -07:00
2c89cd96a8 [Chore] cleanup license indicators in light of SPDX (#17259)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-04-28 19:43:52 +00:00
497 changed files with 15622 additions and 4343 deletions

View File

@@ -1,20 +1,20 @@
steps:
- label: "Build wheel - CUDA 12.4"
- label: "Build wheel - CUDA 12.8"
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.1"
- label: "Build wheel - CUDA 12.6"
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
@@ -57,6 +57,8 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- "yes | docker system prune -a"
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"

View File

@@ -75,37 +75,51 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py \
--ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/test_mamba_mixer2.py \
--ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_cutlass_moe.py \
--ignore=kernels/test_mamba_ssm_ssd.py \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_block_int8.py \
--ignore=kernels/test_fused_quant_layernorm.py \
--ignore=kernels/test_int8_kernel.py \
--ignore=kernels/test_triton_moe_ptpc_fp8.py \
--ignore=kernels/test_permute_cols.py"
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests

View File

@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
else
# only upload index.html for cu124 wheels (default wheels)
# only upload index.html for cu128 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
else
# only upload index.html for cu124 wheels (default wheels)
# only upload index.html for cu128 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

View File

@@ -39,7 +39,7 @@ steps:
- pip install -r ../../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/inference_params.html
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
source_file_dependencies:
@@ -293,6 +293,7 @@ steps:
parallelism: 4
- label: PyTorch Compilation Unit Tests
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -302,6 +303,7 @@ steps:
- pytest -v -s compile/test_sequence_parallelism.py
- label: PyTorch Fullgraph Smoke Test # 9min
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -312,6 +314,7 @@ steps:
- pytest -v -s compile/piecewise/test_toy_llama.py
- label: PyTorch Fullgraph Test # 18min
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -319,6 +322,7 @@ steps:
- pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- tests/kernels/core
@@ -326,6 +330,7 @@ steps:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/attention/
- vllm/attention
@@ -336,6 +341,7 @@ steps:
parallelism: 2
- label: Kernels Quantization Test %N
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
@@ -345,6 +351,7 @@ steps:
parallelism: 2
- label: Kernels MoE Test
#mirror_hardwares: [amd]
source_file_dependencies:
- csrc/moe/
- tests/kernels/moe
@@ -353,6 +360,7 @@ steps:
- pytest -v -s kernels/moe
- label: Kernels Mamba Test
#mirror_hardwares: [amd]
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
@@ -385,12 +393,13 @@ steps:
commands:
- pytest -v -s benchmarks/
- label: Quantization Test # 33min
- label: Quantization Test
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
commands:
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -430,88 +439,85 @@ steps:
##### models test #####
- label: Basic Models Test # 24min
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) # 32min
- label: Language Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language -m core_model
- label: Language Models Test (Extended) # 1h10min
- label: Language Models Test (Extended)
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 40min
- label: Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- pytest -v -s models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1 # 48m
- label: Multi-Modal Models Test (Extended) 1
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m
- label: Multi-Modal Models Test (Extended) 2
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Quantized Models Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
@@ -581,9 +587,8 @@ steps:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.

View File

@@ -21,12 +21,12 @@ body:
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
<details>
<summary>The output of `python collect_env.py`</summary>
<summary>The output of <code>python collect_env.py</code></summary>
```text
Your output of `python collect_env.py` here
```
</details>
validations:
required: true

View File

@@ -66,7 +66,7 @@ jobs:
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@@ -79,4 +79,4 @@ jobs:
"max_tokens": 7,
"temperature": 0
}'):$CODE"
echo "$CODE"
echo "$CODE"

.gitignore
View File

@@ -80,6 +80,7 @@ instance/
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/
docs/source/api/vllm
# PyBuilder
.pybuilder/

View File

@@ -12,29 +12,29 @@ repos:
- id: yapf
args: [--in-place, --verbose]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.0
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
rev: 6.0.1
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.7
rev: v20.1.3
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.27
rev: v0.9.29
hooks:
- id: pymarkdown
args: [fix]
@@ -43,10 +43,10 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.2
rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:

View File

@@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@@ -46,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
#
# Try to find python package with an executable that exactly matches
@@ -241,6 +240,7 @@ set(VLLM_EXT_SRC
"csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
@@ -249,9 +249,8 @@
if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v3.9.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -269,7 +268,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.9.0
GIT_TAG ${CUTLASS_REVISION}
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -681,6 +680,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_PERMUTE_SRC
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
"csrc/moe/moe_permute_unpermute_op.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_PERMUTE_SRC}"
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
_moe_C
@@ -689,6 +699,8 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)

benchmarks/auto_tune.sh
View File

@@ -0,0 +1,212 @@
#!/bin/bash
# This script tunes server parameter combinations to maximize throughput for a given requirement.
# The parameters currently tuned are max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache hit rate.
# Pre-requisite:
# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the corresponding torch and xla versions.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: target prefix cache hit rate (percent)
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# 4. Run the script. It may take a long time; use tmux so the run survives a disconnection.
# 5. The final result will be saved in the RESULT file.
# Example use cases
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
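# A concrete sketch of use case 2 (illustrative values, not defaults): edit the
# variables below to BASE="$HOME", DOWNLOAD_DIR="$HOME/models", INPUT_LEN=1800,
# OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500, then run:
#   bash benchmarks/auto_tune.sh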
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
echo "result file$ $RESULT"
echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER
cd "$BASE/vllm"
# create sonnet_4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
# start the server
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization 0.98 \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size 1 \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1
fi
echo "run benchmark test..."
echo
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
fi
if (( ! meet_latency_requirement )); then
# start from request-rate as int(through_put) + 1
request_rate=$((${through_put%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
break
fi
request_rate=$((request_rate-1))
done
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$through_put
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
fi
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20)
return 0
}
num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
for num_batched_tokens in $num_batched_tokens_list; do
run_benchmark $num_seqs $num_batched_tokens
done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

View File

@@ -201,6 +201,7 @@ async def async_request_deepspeed_mii(
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
@@ -260,6 +261,7 @@ async def async_request_openai_completions(
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"repetition_penalty": 1.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,

View File

@@ -771,6 +771,60 @@ class InstructCoderDataset(HuggingFaceDataset):
return sampled_requests
# -----------------------------------------------------------------------------
# MT-Bench Dataset Implementation
# -----------------------------------------------------------------------------
class MTBenchDataset(HuggingFaceDataset):
"""
MT-Bench Dataset.
https://huggingface.co/datasets/philschmid/mt-bench
We create a single turn dataset for MT-Bench.
This is similar to Spec decoding benchmark setup in vLLM
https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
""" # noqa: E501
DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
SUPPORTED_DATASET_PATHS = {
"philschmid/mt-bench",
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item['turns'][0]
# apply template
prompt = tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------

View File

@@ -52,9 +52,9 @@ except ImportError:
from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
ConversationDataset, HuggingFaceDataset,
InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
InstructCoderDataset, MTBenchDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -595,6 +595,9 @@ def main(args: argparse.Namespace):
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
dataset_class = MTBenchDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ConversationDataset
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:

View File

@ -123,6 +123,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
copy.deepcopy(schema) for _ in range(args.num_prompts)
]
for i in range(len(json_schemas)):
if "properties" not in json_schemas[i]:
json_schemas[i]["properties"] = {}
json_schemas[i]["properties"][
f"__optional_field_{uuid.uuid4()}"] = {
"type":
@ -134,7 +136,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
json_schemas = [schema] * args.num_prompts
def gen_prompt(index: int):
return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
def get_schema(index: int):
return json_schemas[index % len(json_schemas)]
@ -231,7 +233,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
idx -= len_dataset
schema = dataset["schema"][idx]
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
tokenize=False)
tokenize=False,
add_generation_prompt=True)
input_len = len(tokenizer(prompt).input_ids)
completion = dataset["completion"][idx]
@ -849,7 +852,7 @@ if __name__ == "__main__":
'json', 'json-unique', 'grammar', 'regex',
'choice', 'xgrammar_bench'
])
parser.add_argument("--json_schema_path",
parser.add_argument("--json-schema-path",
type=str,
default=None,
help="Path to json schema.")

View File

@ -90,7 +90,8 @@ def bench_run(results: list[benchmark.Measurement], model: str,
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
topk_weights, topk_ids, token_expert_indices = fused_topk(
a, score, topk, renormalize=False)
def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,

View File

@ -115,8 +115,8 @@ def benchmark_config(config: BenchmarkConfig,
from vllm.model_executor.layers.fused_moe import override_config
with override_config(config):
if use_deep_gemm:
topk_weights, topk_ids = fused_topk(x, input_gating, topk,
False)
topk_weights, topk_ids, token_expert_indices = fused_topk(
x, input_gating, topk, False)
return fused_experts(
x,
w1,
@ -442,8 +442,14 @@ class BenchmarkWorker:
hidden_size, search_space,
is_fp16, topk)
with torch.cuda.device(self.device_id) if current_platform.is_rocm(
) else nullcontext():
need_device_guard = False
if current_platform.is_rocm():
visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
if visible_device != f"{self.device_id}":
need_device_guard = True
with torch.cuda.device(
self.device_id) if need_device_guard else nullcontext():
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(
@ -527,7 +533,7 @@ def get_weight_block_size_safety(config, default_value=None):
def main(args: argparse.Namespace):
print(args)
block_quant_shape = None
config = AutoConfig.from_pretrained(
args.model, trust_remote_code=args.trust_remote_code)
if config.architectures[0] == "DbrxForCausalLM":
@ -546,8 +552,9 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
block_quant_shape = get_weight_block_size_safety(config)
elif config.architectures[0] == "Qwen2MoeForCausalLM":
elif config.architectures[0] in [
"Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"
]:
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
@ -565,6 +572,7 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)
if args.batch_size is None:
batch_sizes = [
@ -576,6 +584,15 @@ def main(args: argparse.Namespace):
use_deep_gemm = bool(args.use_deep_gemm)
if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
# Ray will set ROCR_VISIBLE_DEVICES for device visibility
logger.warning(
"Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
"Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
val = os.environ["HIP_VISIBLE_DEVICES"]
os.environ["ROCR_VISIBLE_DEVICES"] = val
del os.environ["HIP_VISIBLE_DEVICES"]
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]

View File

@ -0,0 +1,349 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
from typing import Any, TypedDict
import ray
import torch
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
_moe_permute, _moe_unpermute_and_reduce)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
BLOCK_SIZE_K: int
GROUP_SIZE_M: int
num_warps: int
num_stages: int
def benchmark_permute(num_tokens: int,
num_experts: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
use_customized_permute: bool = False) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
# output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8:
align_block_size = 128  # deepgemm needs M aligned to 128-element blocks
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else:
align_block_size = None
qhidden_states = hidden_states
gating_output = torch.randn(num_iters,
num_tokens,
num_experts,
dtype=torch.float32)
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
topk_weights, topk_ids, token_expert_indices = fused_topk(
qhidden_states, input_gating, topk, False)
def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx,
m_indices) = moe_permute(
qhidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
token_expert_indices=token_expert_indices,
topk=topk,
n_expert=num_experts,
n_local_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
else:
(permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
inv_perm) = _moe_permute(qhidden_states, None, topk_ids,
num_experts, None, align_block_size)
# JIT compilation & warmup
run()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000  # us per invocation (10 per replay)
graph.reset()
return avg
def benchmark_unpermute(num_tokens: int,
num_experts: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
use_customized_permute: bool = False) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8:
align_block_size = 128  # deepgemm needs M aligned to 128-element blocks
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else:
align_block_size = None
qhidden_states = hidden_states
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
topk_weights, topk_ids, token_expert_indices = fused_topk(
qhidden_states, input_gating, topk, False)
def prepare():
if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx,
m_indices) = moe_permute(
qhidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
token_expert_indices=token_expert_indices,
topk=topk,
n_expert=num_experts,
n_local_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (permuted_hidden_states.to(dtype), first_token_off,
inv_perm_idx, m_indices)
else:
(permuted_qhidden_states, a1q_scale, sorted_token_ids, expert_ids,
inv_perm) = _moe_permute(qhidden_states, None, topk_ids,
num_experts, None, align_block_size)
# convert to fp16/bf16 as gemm output
return (permuted_qhidden_states.to(dtype), a1q_scale,
sorted_token_ids, expert_ids, inv_perm)
def run(input: tuple):
if use_customized_permute:
(permuted_hidden_states, first_token_off, inv_perm_idx,
m_indices) = input
moe_unpermute(permuted_hidden_states, topk_weights, topk_ids,
inv_perm_idx, first_token_off, topk, num_experts,
num_experts)
else:
(permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
inv_perm) = input
_moe_unpermute_and_reduce(output_hidden_states,
permuted_hidden_states, inv_perm,
topk_weights)
# JIT compilation & warmup
input = prepare()
run(input)
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run(input)
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000  # us per invocation (10 per replay)
graph.reset()
return avg
@ray.remote(num_gpus=1)
class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(seed)
self.seed = seed
# Get the device ID to allocate tensors and kernels
# on the respective GPU. This is required for Ray to work
# correctly with multi-GPU tuning on the ROCm platform.
self.device_id = int(ray.get_gpu_ids()[0])
def benchmark(
self,
num_tokens: int,
num_experts: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_customized_permute: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
permute_time = benchmark_permute(
num_tokens,
num_experts,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
use_customized_permute=use_customized_permute)
unpermute_time = benchmark_unpermute(
num_tokens,
num_experts,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
use_customized_permute=use_customized_permute)
return permute_time, unpermute_time
def get_weight_block_size_safety(config, default_value=None):
quantization_config = getattr(config, 'quantization_config', {})
if isinstance(quantization_config, dict):
return quantization_config.get('weight_block_size', default_value)
return default_value
def main(args: argparse.Namespace):
print(args)
config = AutoConfig.from_pretrained(
args.model, trust_remote_code=args.trust_remote_code)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
elif (config.architectures[0] == "DeepseekV3ForCausalLM"
or config.architectures[0] == "DeepseekV2ForCausalLM"):
E = config.n_routed_experts
topk = config.num_experts_per_tok
elif config.architectures[0] in [
"Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"
]:
E = config.num_experts
topk = config.num_experts_per_tok
else:
# Support for llama4
config = config.get_text_config()
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
if args.batch_size is None:
batch_sizes = [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]
else:
batch_sizes = [args.batch_size]
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
def _distribute(method: str, inputs: list[Any]) -> list[Any]:
outputs = []
worker_idx = 0
for input_args in inputs:
worker = workers[worker_idx]
worker_method = getattr(worker, method)
output = worker_method.remote(*input_args)
outputs.append(output)
worker_idx = (worker_idx + 1) % num_gpus
return ray.get(outputs)
outputs = _distribute(
"benchmark", [(batch_size, E, hidden_size, topk, dtype, use_fp8_w8a8,
use_int8_w8a16, use_customized_permute)
for batch_size in batch_sizes])
for batch_size, (permute, unpermute) in zip(batch_sizes, outputs):
print(f"Batch size: {batch_size}")
print(f"Permute time: {permute:.2f} us")
print(f"Unpermute time: {unpermute:.2f} us")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--model",
type=str,
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
parser.add_argument("--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"],
default="auto")
parser.add_argument("--use-customized-permute", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--trust-remote-code", action="store_true")
args = parser.parse_args()
main(args)
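The measurement harness shared by benchmark_permute and benchmark_unpermute above, distilled into a standalone sketch (assumes a CUDA device; the matmul is only a stand-in for the permute/unpermute kernels):

# Minimal sketch of the capture-10-replays-per-graph timing idiom used above.
import torch

x = torch.randn(1024, 1024, device="cuda")

def run():
    return x @ x

run()  # JIT compilation & warmup
torch.cuda.synchronize()

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    for _ in range(10):  # capture 10 invocations per replay
        run()
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
num_iters = 100
latencies = []
for _ in range(num_iters):
    torch.cuda.synchronize()
    start.record()
    graph.replay()
    end.record()
    end.synchronize()
    latencies.append(start.elapsed_time(end))  # milliseconds
# ms per replay / 10 invocations, * 1000 -> microseconds per invocation
avg_us = sum(latencies) / (num_iters * 10) * 1000
print(f"{avg_us:.2f} us per invocation")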

View File

@ -7,3 +7,22 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
if (num <= 1) return num;
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}
template <typename A, typename B>
static inline constexpr auto div_ceil(A a, B b) {
return (a + b - 1) / b;
}
// Round a down to the next multiple of b. The caller is responsible for making
// sure that b is non-zero
template <typename T>
inline constexpr T round_to_previous_multiple_of(T a, T b) {
return a % b == 0 ? a : (a / b) * b;
}
// Round a up to the next multiple of b. The caller is responsible for making
// sure that b is non-zero
template <typename T>
inline constexpr T round_to_next_multiple_of(T a, T b) {
return a % b == 0 ? a : ((a / b) + 1) * b;
}
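A Python transliteration of these helpers with worked values (for illustration only; semantics assume positive integers and non-zero b, as the comments above require):

def div_ceil(a, b):
    return (a + b - 1) // b

def round_to_previous_multiple_of(a, b):
    return a if a % b == 0 else (a // b) * b

def round_to_next_multiple_of(a, b):
    return a if a % b == 0 else (a // b + 1) * b

assert div_ceil(7, 4) == 2
assert round_to_previous_multiple_of(30, 16) == 16
assert round_to_next_multiple_of(30, 16) == 32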

View File

@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
const int SUB = 0x64006400;
const int MUL = 0x2c002c00;

View File

@ -209,8 +209,8 @@ __device__ inline typename ScalarType<half>::FragB dequant<half, 4>(
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
@ -233,9 +233,9 @@ dequant<nv_bfloat16, 4>(int q,
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
q >>= 4;
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
static constexpr uint32_t MUL = 0x3F803F80;
static constexpr uint32_t ADD = 0xC308C308;

View File

@ -0,0 +1,133 @@
#include <c10/core/ScalarType.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include "permute_unpermute_kernels/moe_permute_unpermute_kernel.h"
#include "permute_unpermute_kernels/dispatch.h"
#include "core/registration.h"
void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indicies, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
torch::Tensor&
permuted_input, // [topk * n_token/align_block_size_m, hidden]
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
torch::Tensor& m_indices) { // [align_expand_m]
TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
"topk_weights must be float32");
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
"token_expert_indicies must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
align_block_size.has_value() ? align_block_size.value() : -1;
auto stream = at::cuda::getCurrentCUDAStream().stream();
const long sorter_size =
CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert);
auto sort_workspace = torch::empty(
{sorter_size},
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
auto permuted_experts_id = torch::empty_like(topk_ids);
auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
CubKeyValueSorter sorter{};
int64_t* valid_num_ptr = nullptr;
  // Pre-processing kernel for expert parallelism:
  // non-local expert ids get an "n_expert" offset so local experts sort first;
  // local expert ids [n, ..., n+n_local_expert-1] map to [0, n_local_expert-1].
  // For example, with 4 experts and ep_size=2, ep_rank=1 owns global expert
  // ids [2, 3] with expert_map = [-1, -1, 0, 1]. preprocess_topk_id maps
  // global expert ids [2, 3] to local expert ids [0, 1], and maps global
  // expert ids [0, 1] (not on ep_rank=1) to [4, 5] by adding n_expert. This
  // remapping gives local experts priority in the following sort of topk_ids
  // and in the scan of each ep rank's local expert_first_token_offset for the
  // next group gemm.
if (expert_map.has_value()) {
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
valid_num_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token,
n_expert, n_local_expert, topk, sorter,
get_ptr<int>(sort_workspace), stream);
// dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int>(src_row_id2dst_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, align_block_size_value, stream);
});
// get m_indices and update expert_first_token_offset with align block
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
if (align_block_size.has_value()) {
// update align_expert_first_token_offset
expert_first_token_offset.copy_(align_expert_first_token_offset);
}
}
void moe_unpermute(
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
const torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
const torch::Tensor& expert_first_token_offset, // [n_local_expert+1]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
torch::Tensor& hidden_states // [n_token, hidden]
) {
TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
"topk_ids shape must be same as src_row_id2dst_row_id_map");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
  TORCH_CHECK(
      permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
      "permuted_hidden_states dtype must be same as hidden_states");
auto n_token = hidden_states.size(0);
auto n_hidden = hidden_states.size(1);
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int64_t* valid_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
get_ptr<scalar_t>(permuted_hidden_states),
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
n_token, n_hidden, topk, valid_ptr, stream);
});
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("moe_permute", &moe_permute);
m.impl("moe_unpermute", &moe_unpermute);
}
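The expert-parallel preprocessing described in the comment inside moe_permute, replayed in Python on the same 4-expert, ep_size=2 example (a sketch, not the device kernel):

# preprocessTopkIdKernel's remapping: ep_rank=1 owns global experts [2, 3].
n_expert = 4
expert_map = [-1, -1, 0, 1]  # global expert id -> local id, -1 if not local

def preprocess_topk_ids(topk_ids):
    out = []
    for gid in topk_ids:
        local = expert_map[gid]
        # Non-local experts get gid + n_expert so they sort after local ones.
        out.append(gid + n_expert if local == -1 else local)
    return out

print(preprocess_topk_ids([0, 2, 3, 1]))  # -> [4, 0, 1, 5]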

View File

@ -108,11 +108,11 @@ __device__ inline void dequant<half2, 4>(int q, half2* res) {
const int MUL = 0x2c002c00;
const int ADD = 0xd400d400;
int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
q >>= 8;
int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
res[0] = __hsub2(*reinterpret_cast<half2*>(&lo0),
*reinterpret_cast<const half2*>(&SUB));
@ -149,13 +149,13 @@ __device__ inline void dequant<nv_bfloat162, 4>(int q, nv_bfloat162* res) {
static constexpr uint32_t MASK = 0x000f000f;
static constexpr uint32_t EX = 0x43004300;
int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
q >>= 4;
int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
q >>= 4;
int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
q >>= 4;
int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
static constexpr uint32_t MUL = 0x3F803F80;
static constexpr uint32_t ADD = 0xC300C300;

View File

@ -0,0 +1,53 @@
#pragma once
#include <cuda_fp8.h>
#define MOE_SWITCH(TYPE, ...) \
at::ScalarType _st = ::detail::scalar_type(TYPE); \
switch (_st) { \
__VA_ARGS__ \
default: \
      TORCH_CHECK(false, "[moe permute] data type dispatch fail!") \
}
#define MOE_DISPATCH_CASE(enum_type, ...) \
case enum_type: { \
using scalar_t = ScalarType2CudaType<enum_type>::type; \
__VA_ARGS__(); \
break; \
}
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
#define MOE_DISPATCH(TYPE, ...) \
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
template <at::ScalarType type>
struct ScalarType2CudaType;
template <>
struct ScalarType2CudaType<at::ScalarType::Float> {
using type = float;
};
template <>
struct ScalarType2CudaType<at::ScalarType::Half> {
using type = half;
};
template <>
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
using type = __nv_bfloat16;
};
// #if __CUDA_ARCH__ >= 890
// fp8
template <>
struct ScalarType2CudaType<at::ScalarType::Float8_e5m2> {
using type = __nv_fp8_e5m2;
};
template <>
struct ScalarType2CudaType<at::ScalarType::Float8_e4m3fn> {
using type = __nv_fp8_e4m3;
};
// #endif

View File

@ -0,0 +1,229 @@
#include "moe_permute_unpermute_kernel.h"
// CubKeyValueSorter definition begin
CubKeyValueSorter::CubKeyValueSorter()
: num_experts_(0), num_bits_(sizeof(int) * 8) {}
int CubKeyValueSorter::expertsToBits(int num_experts) {
// Max value we represent is V = num_experts + (num_experts - 1) = 2 *
// num_experts - 1 The maximum number of bits is therefore floor(log2(V)) + 1
return static_cast<int>(log2(2 * num_experts - 1)) + 1;
}
CubKeyValueSorter::CubKeyValueSorter(int const num_experts)
: num_experts_(num_experts), num_bits_(expertsToBits(num_experts)) {}
void CubKeyValueSorter::updateNumExperts(int const num_experts) {
num_experts_ = num_experts;
num_bits_ = expertsToBits(num_experts);
}
size_t CubKeyValueSorter::getWorkspaceSize(size_t const num_key_value_pairs,
int const num_experts) {
int num_bits = expertsToBits(num_experts);
size_t required_storage = 0;
int* null_int = nullptr;
cub::DeviceRadixSort::SortPairs(nullptr, required_storage, null_int, null_int,
null_int, null_int, num_key_value_pairs, 0,
num_bits);
  // For (num_key_value_pairs, num_experts, num_bits) = (64, 4, 3), the
  // required_storage reported by CUB varies between 0 and 1 for identical
  // inputs, so clamp it to at least 1 byte.
if (required_storage == 0) {
required_storage = 1;
}
return required_storage;
}
void CubKeyValueSorter::run(void* workspace, size_t const workspace_size,
int const* keys_in, int* keys_out,
int const* values_in, int* values_out,
size_t const num_key_value_pairs,
cudaStream_t stream) {
size_t expected_ws_size = getWorkspaceSize(num_key_value_pairs, num_experts_);
size_t actual_ws_size = workspace_size;
TORCH_CHECK(expected_ws_size <= workspace_size,
"[CubKeyValueSorter::run] The allocated workspace is too small "
"to run this problem.");
cub::DeviceRadixSort::SortPairs(workspace, actual_ws_size, keys_in, keys_out,
values_in, values_out, num_key_value_pairs, 0,
num_bits_, stream);
}
// CubKeyValueSorter definition end
static inline size_t pad_to_multiple_of_16(size_t const& input) {
static constexpr int ALIGNMENT = 16;
return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT);
}
template <class T>
__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices,
int64_t const arr_length,
T const target) {
int64_t low = 0, high = arr_length - 1, target_location = -1;
while (low <= high) {
int64_t mid = (low + high) / 2;
if (sorted_indices[mid] >= target) {
high = mid - 1;
} else {
low = mid + 1;
target_location = mid;
}
}
return target_location + 1;
}
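findTotalEltsLessThanTarget returns the count of sorted elements strictly below the target, which is exactly Python's bisect_left (shown here as a sanity check, not the device code):

import bisect

sorted_experts = [0, 0, 1, 1, 1, 3]
assert bisect.bisect_left(sorted_experts, 1) == 2  # tokens before expert 1
assert bisect.bisect_left(sorted_experts, 2) == 5  # expert 2 owns no tokens
assert bisect.bisect_left(sorted_experts, 4) == 6  # total valid tokens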
// Calculates the start offset of the tokens for a given expert. The last
// element is the total number of valid tokens
__global__ void computeExpertFirstTokenOffsetKernel(
int const* sorted_experts, int64_t const sorted_experts_len,
int const num_experts, int64_t* expert_first_token_offset) {
// First, compute the global tid. We only need 1 thread per expert.
int const expert = blockIdx.x * blockDim.x + threadIdx.x;
// Note that expert goes [0, num_experts] (inclusive) because we want a count
// for the total number of active tokens at the end of the scan.
if (expert >= num_experts + 1) {
return;
}
expert_first_token_offset[expert] =
findTotalEltsLessThanTarget(sorted_experts, sorted_experts_len, expert);
}
void computeExpertFirstTokenOffset(int const* sorted_indices,
int const total_indices,
int const num_experts,
int64_t* expert_first_token_offset,
cudaStream_t stream) {
int const num_entries = num_experts + 1;
int const threads = std::min(1024, num_entries);
int const blocks = (num_entries + threads - 1) / threads;
computeExpertFirstTokenOffsetKernel<<<blocks, threads, 0, stream>>>(
sorted_indices, total_indices, num_experts, expert_first_token_offset);
}
void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
int* permuted_experts, int* permuted_rows,
int64_t* expert_first_token_offset, int num_rows,
int num_experts, int num_experts_per_node, int k,
CubKeyValueSorter& sorter, void* sorter_ws,
cudaStream_t stream) {
int64_t const expanded_num_rows = static_cast<int64_t>(k) * num_rows;
// We need to use the full num_experts because that is the sentinel value used
// by topk for disabled experts
sorter.updateNumExperts(num_experts);
size_t const sorter_ws_size_bytes = pad_to_multiple_of_16(
sorter.getWorkspaceSize(expanded_num_rows, num_experts));
sorter.run((void*)sorter_ws, sorter_ws_size_bytes, expert_for_source_row,
permuted_experts, source_rows, permuted_rows, expanded_num_rows,
stream);
computeExpertFirstTokenOffset(permuted_experts, expanded_num_rows,
num_experts_per_node, expert_first_token_offset,
stream);
}
__global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
const int* expert_map_ptr,
int num_experts) {
auto tidx = threadIdx.x;
auto bidx = blockIdx.x;
auto lidx = tidx & 31;
auto widx = tidx >> 5;
auto warp_count = (blockDim.x + 31) >> 5;
auto offset = bidx * blockDim.x;
auto bound = min(offset + blockDim.x, size);
extern __shared__ int smem_expert_map[];
// store expert_map in smem
for (int i = tidx; i < num_experts; i += blockDim.x) {
smem_expert_map[i] = expert_map_ptr[i];
}
__syncthreads();
  // Query the global expert id in the expert map:
  // if expert_map[global expert id] == -1, add n_expert to the id;
  // otherwise set the id to expert_map[global expert id].
if (offset + tidx < bound) {
auto topk_id = topk_id_ptr[offset + tidx];
auto local_expert_idx = smem_expert_map[topk_id];
if (local_expert_idx == -1) {
topk_id += num_experts;
} else {
topk_id = local_expert_idx;
}
__syncwarp();
topk_id_ptr[offset + tidx] = topk_id;
}
}
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
const int* expert_map_ptr, int num_experts,
cudaStream_t stream) {
int block = std::min(size, 1024);
int grid = (size + block - 1) / block;
int smem_size = (num_experts) * sizeof(int);
preprocessTopkIdKernel<<<grid, block, smem_size, stream>>>(
topk_id_ptr, size, expert_map_ptr, num_experts);
}
template <bool ALIGN_BLOCK_SIZE>
__global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
int64_t* align_expert_first_token_offset,
int* m_indices, const int num_local_expert,
const int align_block_size) {
int eidx = blockIdx.x;
int tidx = threadIdx.x;
extern __shared__ int64_t smem_expert_first_token_offset[];
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
}
__syncthreads();
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
auto first_token_offset = smem_expert_first_token_offset[eidx];
int n_token_in_expert = last_token_offset - first_token_offset;
if constexpr (ALIGN_BLOCK_SIZE) {
n_token_in_expert = (n_token_in_expert + align_block_size - 1) /
align_block_size * align_block_size;
// round up to ALIGN_BLOCK_SIZE
int64_t accumulate_align_offset = 0;
for (int i = 1; i <= eidx + 1; i++) {
int n_token = smem_expert_first_token_offset[i] -
smem_expert_first_token_offset[i - 1];
accumulate_align_offset =
accumulate_align_offset + (n_token + align_block_size - 1) /
align_block_size * align_block_size;
if (i == eidx) {
first_token_offset = accumulate_align_offset;
}
// last block store align_expert_first_token_offset
if (eidx == num_local_expert - 1 && threadIdx.x == 0) {
align_expert_first_token_offset[i] = accumulate_align_offset;
}
}
}
for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) {
// update m_indice with expert id
m_indices[first_token_offset + idx] = eidx;
}
}
void getMIndices(int64_t* expert_first_token_offset,
int64_t* align_expert_first_token_offset, int* m_indices,
int num_local_expert, const int align_block_size,
cudaStream_t stream) {
int block = 256;
int grid = num_local_expert;
int smem_size = sizeof(int64_t) * (num_local_expert + 1);
if (align_block_size == -1) {
getMIndicesKernel<false><<<grid, block, smem_size, stream>>>(
expert_first_token_offset, align_expert_first_token_offset, m_indices,
num_local_expert, align_block_size);
} else {
getMIndicesKernel<true><<<grid, block, smem_size, stream>>>(
expert_first_token_offset, align_expert_first_token_offset, m_indices,
num_local_expert, align_block_size);
}
}
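A pure-Python reference for the sort-and-scan step above (a sketch standing in for the CUB radix sort plus computeExpertFirstTokenOffset; the stable sort mirrors CUB's behavior):

def sort_and_scan(topk_ids, num_experts):
    rows = range(len(topk_ids))
    # Stable sort of token rows by expert id.
    order = sorted(rows, key=lambda r: topk_ids[r])
    permuted_experts = [topk_ids[r] for r in order]
    # expert_first_token_offset[e] = number of entries with expert id < e;
    # the last element is the total number of valid tokens.
    offsets = [sum(1 for e in permuted_experts if e < x)
               for x in range(num_experts + 1)]
    return order, permuted_experts, offsets

order, experts, offsets = sort_and_scan([2, 0, 1, 0, 2, 1], num_experts=3)
print(order)    # [1, 3, 2, 5, 0, 4] -- rows grouped by expert
print(offsets)  # [0, 2, 4, 6]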

View File

@ -0,0 +1,95 @@
#pragma once
// Referenced from the TensorRT-LLM MoE kernel implementation archived at
// https://github.com/BBuf/tensorrt-llm-moe/tree/master
#include <c10/core/ScalarType.h>
#include <torch/all.h>
#include "dispatch.h"
#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/util_type.cuh>
#include "cutlass/numeric_size.h"
#include "cutlass/array.h"
template <typename T>
inline T* get_ptr(torch::Tensor& t) {
return reinterpret_cast<T*>(t.data_ptr());
}
template <typename T>
inline const T* get_ptr(const torch::Tensor& t) {
return reinterpret_cast<const T*>(t.data_ptr());
}
class CubKeyValueSorter {
public:
CubKeyValueSorter();
CubKeyValueSorter(int const num_experts);
void updateNumExperts(int const num_experts);
static size_t getWorkspaceSize(size_t const num_key_value_pairs,
int const num_experts);
void run(void* workspace, size_t const workspace_size, int const* keys_in,
int* keys_out, int const* values_in, int* values_out,
size_t const num_key_value_pairs, cudaStream_t stream);
private:
static int expertsToBits(int experts);
int num_experts_;
int num_bits_;
};
void computeExpertFirstTokenOffset(int const* sorted_indices,
int const total_indices,
int const num_experts,
int64_t* expert_first_token_offset,
cudaStream_t stream);
void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
int* permuted_experts, int* permuted_rows,
int64_t* expert_first_token_offset, int num_rows,
int num_experts, int num_experts_per_node, int k,
CubKeyValueSorter& sorter, void* sorter_ws,
cudaStream_t stream);
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
const float* unpermuted_scales, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream);
// Final kernel to unpermute and scale
// This kernel unpermutes the original data, does the k-way reduction and
// performs the final skip connection.
template <typename T, typename OutputType, bool CHECK_SKIPPED>
__global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
int64_t const* num_valid_ptr);
template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const num_rows,
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
cudaStream_t stream);
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
const int* expert_map_ptr, int num_experts,
cudaStream_t stream);
void getMIndices(int64_t* expert_first_token_offset,
int64_t* align_expert_first_token_offset, int* m_indices,
int num_local_expert, const int align_block_size,
cudaStream_t stream);
#include "moe_permute_unpermute_kernel.inl"

View File

@ -0,0 +1,211 @@
#pragma once
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
__global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output,
const float* unpermuted_scales, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
int num_local_experts, int align_block_size) {
// Reverse permutation map.
// I do this so that later, we can use the source -> dest map to do the k-way
// reduction and unpermuting. I need the reverse map for that reduction to
// allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1
// thread block will be responsible for all k summations.
int64_t expanded_dest_row = blockIdx.x;
int64_t const expanded_source_row =
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[];
int64_t align_expanded_row_accumulate = 0;
if constexpr (ALIGN_BLOCK_SIZE) {
    // load expert_first_token_offset from global to shared memory
for (int idx = threadIdx.x; idx < num_local_experts + 1;
idx += blockDim.x) {
smem_expert_first_token_offset[idx] =
__ldg(expert_first_token_offset + idx);
}
__syncthreads();
int lane_idx = threadIdx.x & 31;
if (lane_idx == 0) {
      // set token_offset_in_expert = 0 if this expert is not a local expert
int token_offset_in_expert =
expert_id >= num_local_experts
? 0
: expanded_dest_row - smem_expert_first_token_offset[expert_id];
int64_t accumulate_align_offset = 0;
#pragma unroll 1
for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) {
auto n_token_in_expert = smem_expert_first_token_offset[eidx] -
smem_expert_first_token_offset[eidx - 1];
accumulate_align_offset += (n_token_in_expert + align_block_size - 1) /
align_block_size * align_block_size;
}
expanded_dest_row = accumulate_align_offset + token_offset_in_expert;
}
    // lane 0 broadcasts the aligned expanded_dest_row to the warp via shuffle
expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0);
}
if (threadIdx.x == 0) {
assert(expanded_dest_row <= INT32_MAX);
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
static_cast<int>(expanded_dest_row);
}
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
// Load 128-bits per thread
constexpr int64_t ELEM_PER_THREAD = 128 / cutlass::sizeof_bits<T>::value;
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
int64_t const source_k_rank = expanded_source_row / num_rows;
int64_t const source_row = expanded_source_row % num_rows;
auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
auto* dest_row_ptr =
reinterpret_cast<DataElem*>(permuted_output + expanded_dest_row * cols);
int64_t const start_offset = threadIdx.x;
int64_t const stride = blockDim.x;
int64_t const num_elems_in_col = cols / ELEM_PER_THREAD;
for (int elem_index = start_offset; elem_index < num_elems_in_col;
elem_index += stride) {
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
}
}
}
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
const float* unpermuted_scales, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
int64_t const blocks = num_rows * k;
int64_t const threads = 256;
using FuncPtr = decltype(&expandInputRowsKernel<T, true, true>);
FuncPtr func_map[2][2] = {
{&expandInputRowsKernel<T, false, false>,
&expandInputRowsKernel<T, false, true>},
{&expandInputRowsKernel<T, true, false>,
&expandInputRowsKernel<T, true, true>},
};
bool is_check_skip = num_valid_tokens_ptr != nullptr;
bool is_align_block_size = align_block_size != -1;
auto func = func_map[is_check_skip][is_align_block_size];
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<<blocks, threads, smem_size, stream>>>(
unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
align_block_size);
}
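The func_map lookup above picks one of four template instantiations from two runtime booleans; the same table-lookup idea in Python (illustrative only, returning strings instead of kernels):

# Pick one of four specializations via [check_skip][align] indexing.
def make_kernel(check_skip: bool, align: bool):
    def kernel():
        return f"expandInputRowsKernel<T, {check_skip}, {align}>"
    return kernel

func_map = [[make_kernel(False, False), make_kernel(False, True)],
            [make_kernel(True, False), make_kernel(True, True)]]
func = func_map[True][False]  # bool indexes coerce to 0/1, as in C++
print(func())  # expandInputRowsKernel<T, True, False>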
template <class T, class U>
__host__ __device__ constexpr static U arrayConvert(T const& input) {
using Type = typename U::Element;
static_assert(T::kElements == U::kElements);
U u;
#pragma unroll
for (int i = 0; i < U::kElements; i++) {
u[i] = static_cast<Type>(input[i]);
}
return u;
}
template <typename T, typename OutputType, bool CHECK_SKIPPED>
__global__ void finalizeMoeRoutingKernel(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
int64_t const* num_valid_ptr) {
assert(orig_cols % 4 == 0);
int64_t const original_row = blockIdx.x;
int64_t const num_rows = gridDim.x;
auto const offset = original_row * orig_cols;
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
int64_t const num_valid = *num_valid_ptr;
// Load 128-bits per thread, according to the smallest data type we read/write
constexpr int64_t FINALIZE_ELEM_PER_THREAD =
128 / std::min(cutlass::sizeof_bits<OutputType>::value,
cutlass::sizeof_bits<T>::value);
int64_t const start_offset = threadIdx.x;
int64_t const stride = blockDim.x;
int64_t const num_elems_in_col = orig_cols / FINALIZE_ELEM_PER_THREAD;
using InputElem = cutlass::Array<T, FINALIZE_ELEM_PER_THREAD>;
using OutputElem = cutlass::Array<OutputType, FINALIZE_ELEM_PER_THREAD>;
using ComputeElem = cutlass::Array<float, FINALIZE_ELEM_PER_THREAD>;
auto const* expanded_permuted_rows_v =
reinterpret_cast<InputElem const*>(expanded_permuted_rows);
auto* reduced_row_ptr_v = reinterpret_cast<OutputElem*>(reduced_row_ptr);
#pragma unroll
for (int elem_index = start_offset; elem_index < num_elems_in_col;
elem_index += stride) {
ComputeElem thread_output;
thread_output.fill(0);
float row_rescale{0.f};
for (int k_idx = 0; k_idx < k; ++k_idx) {
int64_t const expanded_original_row = original_row + k_idx * num_rows;
int64_t const expanded_permuted_row =
expanded_source_row_to_expanded_dest_row[expanded_original_row];
int64_t const k_offset = original_row * k + k_idx;
float const row_scale = scales[k_offset];
// Check after row_rescale has accumulated
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
continue;
}
auto const* expanded_permuted_rows_row_ptr =
expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;
int64_t const expert_idx = expert_for_source_row[k_offset];
ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
expanded_permuted_rows_row_ptr[elem_index]);
thread_output = thread_output + row_scale * (expert_result);
}
OutputElem output_elem =
arrayConvert<ComputeElem, OutputElem>(thread_output);
reduced_row_ptr_v[elem_index] = output_elem;
}
}
template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher(
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
int const* expert_for_source_row, int64_t const num_rows,
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
cudaStream_t stream) {
int64_t const blocks = num_rows;
int64_t const threads = 256;
bool const check_finished = num_valid_ptr != nullptr;
using FuncPtr = decltype(&finalizeMoeRoutingKernel<T, OutputType, false>);
FuncPtr func_map[2] = {&finalizeMoeRoutingKernel<T, OutputType, false>,
&finalizeMoeRoutingKernel<T, OutputType, true>};
auto* const kernel = func_map[check_finished];
kernel<<<blocks, threads, 0, stream>>>(
expanded_permuted_rows, reduced_unpermuted_output, scales,
expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
num_valid_ptr);
}
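The k-way weighted reduction this kernel performs, as a CPU reference sketch in torch (shapes follow the launcher's arguments; the skipped-token check is omitted for brevity):

import torch

def finalize_moe_routing_ref(expanded_permuted_rows, scales, src2dst, k):
    # expanded_permuted_rows: [num_rows * k, cols], rows grouped by expert
    # scales: [num_rows, k]; src2dst: source row -> permuted (dest) row
    num_rows = expanded_permuted_rows.shape[0] // k
    out = torch.zeros(num_rows, expanded_permuted_rows.shape[1])
    for row in range(num_rows):
        for k_idx in range(k):
            expanded_source_row = row + k_idx * num_rows
            dest = src2dst[expanded_source_row]
            out[row] += scales[row, k_idx] * expanded_permuted_rows[dest]
    return out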

View File

@ -53,7 +53,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"int size_m, int size_n, int size_k,"
"bool is_full_k, bool use_atomic_add,"
"bool use_fp32_reduce, bool is_zp_float) -> Tensor");
m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
"b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
"int b_q_type, SymInt size_m, "
"SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int "
"topk, "
"int moe_block_size, bool replicate_input, bool apply_weights)"
" -> Tensor");
m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
"m_indices)->()");
m.def(
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
"expert_first_token_offset, int n_expert, int n_local_expert,int "
"topk, Tensor! hidden_states)->()");
// conditionally compiled so impl registration is in source file
#endif

View File

@ -97,6 +97,9 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& scale);
void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);

View File

@ -0,0 +1,120 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include <c10/cuda/CUDAGuard.h>
#include <cmath>
#include "core/math.hpp"
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/fp8/common.cuh"
namespace vllm {
template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
return (T)(((float)x) / (1.0f + expf((float)-x)));
}
// Activation and gating kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
typename fp8_type>
__global__ void act_and_mul_quant_kernel(
fp8_type* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const float* scale, const int d) {
const int32_t blocks_per_token = gridDim.y;
const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t);
// We don't expect the hidden dimension to exceed 32 bits so int32 should
// be safe here.
const int32_t tgt_elems_per_block = div_ceil(d, blocks_per_token);
const int32_t elems_per_block =
round_to_next_multiple_of(tgt_elems_per_block, elems_per_128bit_load);
const int32_t block_start = blockIdx.y * elems_per_block;
int32_t block_end = block_start + elems_per_block;
block_end = block_end > d ? d : block_end;
// token_idx is 64 bit to prevent 32 bit overflow when the number of tokens
// is very large
const int64_t token_idx = blockIdx.x;
const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d;
const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d;
fp8_type* __restrict__ out_ptr = out + token_idx * d;
// 128-bit vectorized code
const int32_t vec_loop_end =
round_to_previous_multiple_of(elems_per_128bit_load, block_end);
const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load;
const int32_t vec_start_idx = block_start / elems_per_128bit_load;
const int4* __restrict__ x_128bit_ptr = reinterpret_cast<const int4*>(x_ptr);
const int4* __restrict__ y_128bit_ptr = reinterpret_cast<const int4*>(y_ptr);
int2* __restrict__ out_128bit_ptr = reinterpret_cast<int2*>(out_ptr);
float inverted_scale = 1 / *scale;
#pragma unroll
for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx;
vec_idx += blockDim.x) {
const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]);
const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]);
using scalar_128bit_vec_t = std::array<scalar_t, elems_per_128bit_load>;
using scalar_64bit_vec_t = std::array<fp8_type, elems_per_128bit_load>;
scalar_64bit_vec_t out_vec;
const auto x_vec = reinterpret_cast<scalar_128bit_vec_t const&>(x_128bit);
const auto y_vec = reinterpret_cast<scalar_128bit_vec_t const&>(y_128bit);
#pragma unroll
for (int i = 0; i < elems_per_128bit_load; i++) {
out_vec[i] = scaled_fp8_conversion<true, fp8_type>(
ACT_FN(x_vec[i]) * y_vec[i], inverted_scale);
}
out_128bit_ptr[vec_idx] = reinterpret_cast<const int2&>(out_vec);
}
// Scalar cleanup code
if (block_end > vec_loop_end) {
for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end;
idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
out_ptr[idx] =
scaled_fp8_conversion<true, fp8_type>(ACT_FN(x) * y, inverted_scale);
}
}
}
} // namespace vllm
// Launch activation, gating, and quantize kernel.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4); \
dim3 block(std::min(d, 512)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel", [&] { \
VLLM_DISPATCH_FP8_TYPES( \
out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] { \
vllm::act_and_mul_quant_kernel<scalar_t, KERNEL<scalar_t>, \
fp8_t> \
<<<grid, block, 0, stream>>>(out.data_ptr<fp8_t>(), \
input.data_ptr<scalar_t>(), \
scale.data_ptr<float>(), d); \
}); \
});
void silu_and_mul_quant(torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., 2 * d]
torch::Tensor& scale) {
TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(input.dtype() == torch::kFloat16 ||
input.dtype() == torch::kBFloat16);
TORCH_CHECK(input.size(-1) % 2 == 0);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
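The fused op's math as a plain-torch reference (a sketch: fp8 storage is approximated by saturating to the e4m3 finite range of +-448, since the kernel's scaled_fp8_conversion saturates on overflow):

import torch

def silu_and_mul_quant_ref(input: torch.Tensor, scale: torch.Tensor):
    d = input.shape[-1] // 2
    x, y = input[..., :d], input[..., d:]  # input packs x and y as [..., 2*d]
    out = torch.nn.functional.silu(x.float()) * y.float()
    return torch.clamp(out / scale, -448.0, 448.0)  # e4m3 saturation stand-in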

View File

@ -96,7 +96,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual) {
int32_t hidden_size = input.size(-1);
int32_t num_tokens = input.numel() / hidden_size;
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));

View File

@ -347,7 +347,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
hmma16816_f32<FType>(
C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx],
reinterpret_cast<uint32_t(&)[2]>(BF_frag[reg_buf_idx][n_idx]));
reinterpret_cast<uint32_t (&)[2]>(BF_frag[reg_buf_idx][n_idx]));
}
}
}

View File

@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
q >>= 4;
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
q >>= 4;
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;

View File

@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;

View File

@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static constexpr uint32_t HI = 0x00f000f0;
static constexpr uint32_t EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static constexpr uint32_t SUB = 0x64086408;

View File

@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;

View File

@ -25,8 +25,9 @@
#include "../attention/dtype_fp8.cuh"
#include "../quantization/fp8/amd/quant_utils.cuh"
#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
#define __HIP__MI300_MI250__
#if defined(__HIPCC__) && \
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
#define __HIP__GFX9__
#endif
#if defined(NDEBUG)
@ -42,7 +43,7 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
#define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32
#define GCN_MFMA_INSTR __builtin_amdgcn_mfma_f32_4x4x4f16
@ -1479,7 +1480,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
}
}
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
// clang-format off
template <typename scalar_t, typename cache_t,
@ -1552,7 +1553,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
}
// clang-format on
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \
paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \

View File

@ -81,9 +81,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Activation ops
// Activation function used in SwiGLU.
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
ops.def("silu_and_mul(Tensor! result, Tensor input) -> ()");
ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
ops.def(
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
@ -130,13 +134,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
") -> ()");
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
- // Compute MLA decode using cutlass.
- ops.def(
-     "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
-     " Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
-     " Tensor page_table, float scale) -> ()");
- ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
// Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(
@ -450,6 +447,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]");
ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);
+ // CUTLASS MLA decode
+ ops.def(
+     "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
+     " Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
+     " Tensor page_table, float scale) -> ()");
+ ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
// Mamba selective scan kernel
ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta,"

View File

@ -5,11 +5,11 @@
# docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
ARG CUDA_VERSION=12.8.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
@ -19,7 +19,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -34,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
@ -66,7 +70,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt
uv pip install --system -r requirements/cuda.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -89,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
uv pip install --system -r requirements/build.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . .
ARG GIT_REPO_CHECK=0
@ -158,22 +165,25 @@ FROM base as dev
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
uv pip install --system -r requirements/dev.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
@ -188,7 +198,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -203,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -223,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose
uv pip install --system dist/*.whl --verbose \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
@ -240,19 +255,26 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
# TESTING: install FlashInfer from source to test 2.7.0 final RC
FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
fi
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list
# Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
uv pip install --system -r requirements/build.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE ####################
@ -266,11 +288,13 @@ ADD . /vllm-workspace/
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# install development dependencies (for testing)
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
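The `--extra-index-url` suffix added throughout this Dockerfile is derived from `CUDA_VERSION` by keeping the major and minor components and dropping the dot:

```console
$ echo 12.8.1 | cut -d. -f1,2 | tr -d '.'
128
```

With `CUDA_VERSION=12.8.1` this resolves to `https://download.pytorch.org/whl/cu128`, so the PyTorch wheels always match the image's CUDA toolkit.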

View File

@ -16,7 +16,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -197,7 +200,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \

View File

@ -114,8 +114,16 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1
# User-friendly environment setting for multiprocessing, to avoid the RuntimeError below.
# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
# you must use the 'spawn' start method
# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"]

View File

@ -32,7 +32,10 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \

View File

@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
microdnf clean all
# Python Installation
@ -123,6 +123,7 @@ ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
COPY . /workspace/vllm
WORKDIR /workspace/vllm

View File

@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 -m pip install \
-r requirements/tpu.txt
RUN python3 setup.py develop
RUN python3 -m pip install -e .
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
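Both commands perform an editable install, but `python3 -m pip install -e .` goes through pip's modern PEP 517/660 build machinery instead of invoking setuptools directly; direct `setup.py develop` invocation is deprecated upstream, which is why this Dockerfile (and the TPU installation docs later in this diff) migrate away from it.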

View File

@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 setup.py install
# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-extension-for-pytorch==2.6.10+xpu \
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai

View File

@ -22,3 +22,4 @@ help:
clean:
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
rm -rf "$(SOURCEDIR)/getting_started/examples"
rm -rf "$(SOURCEDIR)/api/vllm"

View File

@ -1,7 +0,0 @@
# AsyncLLMEngine
```{eval-rst}
.. autoclass:: vllm.AsyncLLMEngine
:members:
:show-inheritance:
```

View File

@ -1,17 +0,0 @@
# vLLM Engine
```{eval-rst}
.. automodule:: vllm.engine
```
```{eval-rst}
.. currentmodule:: vllm.engine
```
:::{toctree}
:caption: Engines
:maxdepth: 2
llm_engine
async_llm_engine
:::

View File

@ -1,7 +0,0 @@
# LLMEngine
```{eval-rst}
.. autoclass:: vllm.LLMEngine
:members:
:show-inheritance:
```

View File

@ -1,21 +0,0 @@
# Inference Parameters
Inference parameters for vLLM APIs.
(sampling-params)=
## Sampling Parameters
```{eval-rst}
.. autoclass:: vllm.SamplingParams
:members:
```
(pooling-params)=
## Pooling Parameters
```{eval-rst}
.. autoclass:: vllm.PoolingParams
:members:
```

View File

@ -1,9 +0,0 @@
# Model Adapters
## Module Contents
```{eval-rst}
.. automodule:: vllm.model_executor.models.adapters
:members:
:member-order: bysource
```

View File

@ -1,11 +0,0 @@
# Model Development
## Submodules
:::{toctree}
:maxdepth: 1
interfaces_base
interfaces
adapters
:::

View File

@ -1,9 +0,0 @@
# Optional Interfaces
## Module Contents
```{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces
:members:
:member-order: bysource
```

View File

@ -1,9 +0,0 @@
# Base Model Interfaces
## Module Contents
```{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces_base
:members:
:member-order: bysource
```

View File

@ -1,28 +0,0 @@
(multi-modality)=
# Multi-Modality
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
## Module Contents
```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
```
## Submodules
:::{toctree}
:maxdepth: 1
inputs
parse
processing
profiling
registry
:::

View File

@ -1,49 +0,0 @@
# Input Definitions
## User-facing inputs
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.MultiModalDataDict
```
## Internal data structures
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
:members:
:show-inheritance:
```
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.NestedTensors
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
:members:
:show-inheritance:
```

View File

@ -1,9 +0,0 @@
# Data Parsing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.parse
:members:
:member-order: bysource
```

View File

@ -1,9 +0,0 @@
# Data Processing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.processing
:members:
:member-order: bysource
```

View File

@ -1,9 +0,0 @@
# Memory Profiling
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.profiling
:members:
:member-order: bysource
```

View File

@ -1,9 +0,0 @@
# Registry
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.registry
:members:
:member-order: bysource
```

View File

@ -1,9 +0,0 @@
# Offline Inference
:::{toctree}
:caption: Contents
:maxdepth: 1
llm
llm_inputs
:::

View File

@ -1,7 +0,0 @@
# LLM Class
```{eval-rst}
.. autoclass:: vllm.LLM
:members:
:show-inheritance:
```

View File

@ -1,19 +0,0 @@
# LLM Inputs
```{eval-rst}
.. autodata:: vllm.inputs.PromptType
```
```{eval-rst}
.. autoclass:: vllm.inputs.TextPrompt
:show-inheritance:
:members:
:member-order: bysource
```
```{eval-rst}
.. autoclass:: vllm.inputs.TokensPrompt
:show-inheritance:
:members:
:member-order: bysource
```

docs/source/api/summary.md (new file, 133 lines)
View File

@ -0,0 +1,133 @@
# Summary
(configuration)=
## Configuration
API documentation for vLLM's configuration classes.
```{autodoc2-summary}
vllm.config.ModelConfig
vllm.config.CacheConfig
vllm.config.TokenizerPoolConfig
vllm.config.LoadConfig
vllm.config.ParallelConfig
vllm.config.SchedulerConfig
vllm.config.DeviceConfig
vllm.config.SpeculativeConfig
vllm.config.LoRAConfig
vllm.config.PromptAdapterConfig
vllm.config.MultiModalConfig
vllm.config.PoolerConfig
vllm.config.DecodingConfig
vllm.config.ObservabilityConfig
vllm.config.KVTransferConfig
vllm.config.CompilationConfig
vllm.config.VllmConfig
```
(offline-inference-api)=
## Offline Inference
LLM Class.
```{autodoc2-summary}
vllm.LLM
```
LLM Inputs.
```{autodoc2-summary}
vllm.inputs.PromptType
vllm.inputs.TextPrompt
vllm.inputs.TokensPrompt
```
## vLLM Engines
Engine classes for offline and online inference.
```{autodoc2-summary}
vllm.LLMEngine
vllm.AsyncLLMEngine
```
## Inference Parameters
Inference parameters for vLLM APIs.
(sampling-params)=
(pooling-params)=
```{autodoc2-summary}
vllm.SamplingParams
vllm.PoolingParams
```
(multi-modality)=
## Multi-Modality
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
```{autodoc2-summary}
vllm.multimodal.MULTIMODAL_REGISTRY
```
### Inputs
User-facing inputs.
```{autodoc2-summary}
vllm.multimodal.inputs.MultiModalDataDict
```
Internal data structures.
```{autodoc2-summary}
vllm.multimodal.inputs.PlaceholderRange
vllm.multimodal.inputs.NestedTensors
vllm.multimodal.inputs.MultiModalFieldElem
vllm.multimodal.inputs.MultiModalFieldConfig
vllm.multimodal.inputs.MultiModalKwargsItem
vllm.multimodal.inputs.MultiModalKwargs
vllm.multimodal.inputs.MultiModalInputs
```
### Data Parsing
```{autodoc2-summary}
vllm.multimodal.parse
```
### Data Processing
```{autodoc2-summary}
vllm.multimodal.processing
```
### Memory Profiling
```{autodoc2-summary}
vllm.multimodal.profiling
```
### Registry
```{autodoc2-summary}
vllm.multimodal.registry
```
## Model Development
```{autodoc2-summary}
vllm.model_executor.models.interfaces_base
vllm.model_executor.models.interfaces
vllm.model_executor.models.adapters
```

Binary image file added (106 KiB); contents not shown.

View File

@ -0,0 +1,21 @@
# SPDX-License-Identifier: Apache-2.0
from docutils import nodes
from myst_parser.parsers.sphinx_ import MystParser
from sphinx.ext.napoleon import docstring
class NapoleonParser(MystParser):

    def parse(self, input_string: str, document: nodes.document) -> None:
        # Get the Sphinx configuration
        config = document.settings.env.config

        parsed_content = str(
            docstring.GoogleDocstring(
                str(docstring.NumpyDocstring(input_string, config)),
                config,
            ))
        return super().parse(parsed_content, document)


Parser = NapoleonParser
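In effect, this chains napoleon's two converters to normalize both docstring conventions before MyST parsing: `NumpyDocstring` rewrites NumPy-style sections first, then `GoogleDocstring` handles Google-style ones, so either style reaches the MyST parser as plain field lists.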

View File

@ -13,16 +13,17 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
import datetime
import inspect
import logging
import os
import re
import sys
from pathlib import Path
import requests
from sphinx.ext import autodoc
logger = logging.getLogger(__name__)
sys.path.append(os.path.abspath("../.."))
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.append(os.path.abspath(REPO_ROOT))
# -- Project information -----------------------------------------------------
@ -40,8 +41,7 @@ extensions = [
"sphinx.ext.linkcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"autodoc2",
"myst_parser",
"sphinxarg.ext",
"sphinx_design",
@ -49,7 +49,22 @@ extensions = [
]
myst_enable_extensions = [
"colon_fence",
"fieldlist",
]
autodoc2_packages = [
{
"path": "../../vllm",
"exclude_dirs": ["__pycache__", "third_party"],
},
]
autodoc2_output_dir = "api"
autodoc2_render_plugin = "myst"
autodoc2_hidden_objects = ["dunder", "private", "inherited"]
autodoc2_docstring_parser_regexes = [
(".*", "docs.source.autodoc2_docstring_parser"),
]
autodoc2_sort_names = True
autodoc2_index_template = None
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@ -77,6 +92,11 @@ html_theme_options = {
'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True,
'use_edit_page_button': True,
# Prevents the full API being added to the left sidebar of every page.
# Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB.
'collapse_navbar': True,
# Makes API visible in the right sidebar on API reference pages.
'show_toc_level': 3,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@ -164,73 +184,64 @@ def linkcode_resolve(domain, info):
return None
if not info['module']:
return None
filename = info['module'].replace('.', '/')
module = info['module']
# try to determine the correct file and line number to link to
obj = sys.modules[module]
# Get path from module name
file = Path(f"{info['module'].replace('.', '/')}.py")
path = REPO_ROOT / file
if not path.exists():
path = REPO_ROOT / file.with_suffix("") / "__init__.py"
if not path.exists():
return None
# get as specific as we can
lineno: int = 0
filename: str = ""
try:
for part in info['fullname'].split('.'):
obj = getattr(obj, part)
# Get the line number of the object
with open(path) as f:
lines = f.readlines()
name = info['fullname'].split(".")[-1]
pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"
for lineno, line in enumerate(lines, 1):
if not line or line.startswith("#"):
continue
if re.match(pattern, line):
break
# Skip decorator wrappers by checking if the object is a function
# and has a __wrapped__ attribute (which decorators typically set)
while hasattr(obj, '__wrapped__'):
obj = obj.__wrapped__
# If the line number is not found, return None
if lineno == len(lines):
return None
if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)):
obj = obj.__class__ # Get the class of the instance
lineno = inspect.getsourcelines(obj)[1]
filename = (inspect.getsourcefile(obj)
or f"{filename}.py").split("vllm/", 1)[1]
except Exception:
# For some things, like a class member, won't work, so
# we'll use the line number of the parent (the class)
pass
if filename.startswith("checkouts/"):
# If the line number is found, create the URL
filename = path.relative_to(REPO_ROOT)
if "checkouts" in path.parts:
# a PR build on readthedocs
pr_number = filename.split("/")[1]
filename = filename.split("/", 2)[2]
pr_number = REPO_ROOT.name
base, branch = get_repo_base_and_branch(pr_number)
if base and branch:
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
# Otherwise, link to the source file on the main branch
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
# Mock out external dependencies here, otherwise sphinx-argparse won't work.
autodoc_mock_imports = [
"huggingface_hub",
"pydantic",
"zmq",
"cloudpickle",
"aiohttp",
"starlette",
"blake3",
"compressed_tensors",
"cpuinfo",
"cv2",
"torch",
"transformers",
"psutil",
"prometheus_client",
"sentencepiece",
"vllm._C",
"PIL",
"numpy",
'triton',
"tqdm",
"tensorizer",
"pynvml",
"outlines",
"xgrammar",
"librosa",
"soundfile",
"gguf",
"lark",
"decord",
# The mocks below are required by
# docs/source/serving/openai_compatible_server.md's
# vllm.entrypoints.openai.cli_args
"openai",
"fastapi",
"partial_json_parser",
]
for mock_target in autodoc_mock_imports:
@ -241,18 +252,6 @@ for mock_target in autodoc_mock_imports:
"been loaded into sys.modules when the sphinx build starts.",
mock_target)
class MockedClassDocumenter(autodoc.ClassDocumenter):
"""Remove note about base class when a class is derived from object."""
def add_line(self, line: str, source: str, *lineno: int) -> None:
if line == " Bases: :py:class:`object`":
return
super().add_line(line, source, *lineno)
autodoc.ClassDocumenter = MockedClassDocumenter
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"typing_extensions":
@ -264,7 +263,4 @@ intersphinx_mapping = {
"psutil": ("https://psutil.readthedocs.io/en/stable", None),
}
autodoc_preserve_defaults = True
autodoc_warningiserror = True
navigation_with_keys = False

View File

@ -0,0 +1,87 @@
# Deprecation Policy
This document outlines the official policy and process for deprecating features
in the vLLM project.
## Overview
vLLM uses a structured "deprecation pipeline" to guide the lifecycle of
deprecated features. This policy ensures that users are given clear and
sufficient notice when a feature is deprecated and that deprecations proceed in
a consistent and predictable manner.
We aim to strike a balance between continued innovation and respecting users'
reliance on existing functionality. Deprecations are tied to our **minor (Y)
releases** following semantic versioning (X.Y.Z), where:
- **X** is a major version (rare)
- **Y** is a minor version (used for significant changes, including deprecations/removals)
- **Z** is a patch version (used for fixes and safer enhancements)
Features that fall under this policy include (at a minimum) the following:
- CLI flags
- Environment variables
- Configuration files
- APIs in the OpenAI-compatible API server
- Public Python APIs for the `vllm` library
## Deprecation Pipeline
The deprecation process consists of several clearly defined stages that span
multiple Y releases:
**1. Deprecated (Still On By Default)**
- **Action**: Feature is marked as deprecated.
- **Timeline**: A removal version is explicitly stated in the deprecation
warning (e.g., "This will be removed in v0.10.0").
- **Communication**: Deprecation is noted in the following, as applicable:
- Help strings
- Log output
- API responses
- `/metrics` output (for metrics features)
- User-facing documentation
- Release notes
- GitHub Issue (RFC) for feedback
- Documentation and use of the `@typing_extensions.deprecated` decorator for Python APIs
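For public Python APIs, this first stage can be sketched with the decorator mentioned above (hypothetical function name; the message states the removal version up front):

```python
from typing_extensions import deprecated

@deprecated("old_generate() is deprecated and will be removed in v0.10.0; "
            "use generate() instead.")
def old_generate(prompt: str) -> str:
    ...
```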
**2. Deprecated (Off By Default)**
- **Action**: Feature is disabled by default, but can still be re-enabled via a
CLI flag or environment variable. Feature throws an error when used without
re-enabling.
- **Purpose**: Allows users who missed earlier warnings a temporary escape hatch
while signaling imminent removal. Ensures any remaining usage is clearly
surfaced and blocks silent breakage before full removal.
**3. Removed**
- **Action**: Feature is completely removed from the codebase.
- **Note**: Only features that have passed through the previous deprecation
stages will be removed.
## Example Timeline
Assume a feature is deprecated in `v0.9.0`.
| Release | Status |
|---------------|-------------------------------------------------------------------------------------------------|
| `v0.9.0` | Feature is deprecated with clear removal version listed. |
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. |
| `v0.11.0` | Feature is removed. |
## Important Guidelines
- **No Removals in Patch Releases**: Removing deprecated features in patch
(`.Z`) releases is disallowed to avoid surprising users.
- **Grace Period for Existing Deprecations**: Any feature deprecated **before
this policy** will have its grace period start **now**, not retroactively.
- **Documentation is Critical**: Ensure every stage of the pipeline is
documented clearly for users.
## Final Notes
This policy is a living document and may evolve as the needs of the project and
its users change. Community feedback is welcome and encouraged as we refine the
process.

View File

@ -17,7 +17,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
- [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
- [Selected onboarding tasks](gh-project:6)
- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new%20model%22)
- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22)
- [Models with multi-modal capabilities](gh-project:10)
## License
@ -40,6 +40,10 @@ pre-commit install --hook-type pre-commit --hook-type commit-msg
# You can manually run pre-commit with
pre-commit run --all-files
# To manually run something from CI that does not run
# locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Unit tests
pytest tests/
```
@ -54,6 +58,12 @@ Therefore, we recommend developing with Python 3.12 to minimise the chance of yo
Currently, the repository is not fully checked by `mypy`.
:::
:::{note}
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
platform to run unit tests locally, rely on the continuous integration system to run the tests for
now.
:::
## Issues
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.

View File

@ -12,5 +12,6 @@ lws
modal
open-webui
skypilot
streamlit
triton
:::

View File

@ -0,0 +1,42 @@
(deployment-streamlit)=
# Streamlit
[Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps.
It can be quickly integrated with vLLM as a backend API server, enabling powerful LLM inference via API calls.
## Prerequisites
- Set up the vLLM environment
## Deploy
- Start the vLLM server with a supported chat completion model, e.g.
```console
vllm serve qwen/Qwen1.5-0.5B-Chat
```
- Install streamlit and openai:
```console
pip install streamlit openai
```
- Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
- Start the Streamlit web UI and start chatting:
```console
streamlit run streamlit_openai_chatbot_webserver.py
# or specify the VLLM_API_BASE or VLLM_API_KEY
VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py
# start with debug mode to view more details
streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
```
:::{image} /assets/deployment/streamlit-chat.png
:::

View File

@ -52,8 +52,8 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
More API details can be found in the {doc}`Offline Inference
</api/offline_inference/index>` section of the API docs.
More API details can be found in the [Offline Inference]
(#offline-inference-api) section of the API docs.
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.

View File

@ -467,6 +467,9 @@ In general:
hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics)
for some time before deleting them.
See the [deprecation policy](project:../../contributing/deprecation_policy.md) for
the project-wide deprecation policy.
### Unimplemented - `vllm:tokens_total`
Added by <gh-pr:4464>, but apparently never implemented. This can just be

View File

@ -16,7 +16,7 @@ In the example above, the KV cache in the first block can be uniquely identified
* Parent hash value: The hash value of the parent hash block.
* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision.
* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below).
* Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments.
> **Note 1:** We only cache full blocks.
@ -76,6 +76,24 @@ Block 3
In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of major KV cache operators (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end to end prefix caching workflow.
**Cache Isolation for Security**
To improve privacy in shared environments, vLLM supports isolating prefix cache reuse through optional per-request salting. By including a `cache_salt` in the request, this value is injected into the hash of the first block, ensuring that only requests with the same salt can reuse cached KV blocks. This prevents timing-based attacks where an adversary could infer cached content by observing latency differences. This offers protection without compromising performance.
```json
{
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Here is a document with details about the world series: ..."},
{"role": "user", "content": "Who won the world series in 2020?"}
],
"cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ=="
}
```
With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
> **Note:** Cache isolation is not supported in engine V0.
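Putting these pieces together, a simplified sketch of how such a block hash could be composed from the parent hash, the block tokens, and the extra keys (illustrative only; vLLM's actual implementation differs in its details):

```python
from typing import Any, Optional

def hash_block(parent_hash: Optional[int],
               block_tokens: tuple[int, ...],
               extra_keys: tuple[Any, ...] = ()) -> int:
    # extra_keys may carry LoRA IDs, multi-modal input hashes, and, for the
    # first block of a request, the per-request cache_salt.
    return hash((parent_hash, block_tokens, extra_keys))
```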
## Data Structure
The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):

View File

@ -42,7 +42,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
* [APC](#automatic-prefix-caching)
* [LoRA](#lora-adapter)
* <abbr title="Prompt Adapter">prmpt adptr</abbr>
* [SD](#spec_decode)
* [SD](#spec-decode)
* CUDA graph
* <abbr title="Pooling Models">pooling</abbr>
* <abbr title="Encoder-Decoder Models">enc-dec</abbr>
@ -122,7 +122,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
*
*
*
- * [SD](#spec_decode)
- * [SD](#spec-decode)
*
*
*
@ -377,7 +377,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
*
* [](gh-issue:8475)
*
- * [SD](#spec_decode)
- * [SD](#spec-decode)
*
*
*

View File

@ -30,6 +30,7 @@ from vllm import LLM
model = LLM("facebook/opt-125m", quantization="fp8")
# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
result = model.generate("Hello, my name is")
print(result[0].outputs[0].text)
```
:::{warning}
@ -44,6 +45,12 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
## Quantization Process
The quantization process involves three main steps:
@ -86,7 +93,7 @@ recipe = QuantizationModifier(
# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
# Save the model.
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
@ -94,18 +101,13 @@ tokenizer.save_pretrained(SAVE_DIR)
### 3. Evaluating Accuracy
Install `vllm` and `lm-evaluation-harness`:
```console
pip install vllm lm-eval==0.4.4
```
Load and run the model in `vllm`:
```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
model.generate("Hello my name is")
result = model.generate("Hello my name is")
print(result[0].outputs[0].text)
```
Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
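The command itself falls outside this hunk; based on the surrounding text it presumably resembles the following (`--limit 250` matches the 250 samples mentioned above, and the remaining flags are assumptions):

```console
lm_eval --model vllm \
  --model_args pretrained=./Meta-Llama-3-8B-Instruct-FP8-Dynamic \
  --tasks gsm8k --num_fewshot 5 --limit 250
```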
@ -188,4 +190,5 @@ from vllm import LLM
model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/")
# INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB
result = model.generate("Hello, my name is")
print(result[0].outputs[0].text)
```

View File

@ -17,6 +17,7 @@ gptqmodel
int4
int8
fp8
modelopt
quark
quantized_kvcache
torchao

View File

@ -18,6 +18,12 @@ To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
## Quantization Process
The quantization process involves four main steps:
@ -87,7 +93,7 @@ oneshot(
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

View File

@ -19,6 +19,12 @@ To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
## Quantization Process
The quantization process involves four main steps:
@ -91,7 +97,7 @@ oneshot(
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

View File

@ -0,0 +1,78 @@
# NVIDIA TensorRT Model Optimizer
The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models.
We recommend installing the library with:
```console
pip install nvidia-modelopt
```
## Quantizing HuggingFace Models with PTQ
You can quantize HuggingFace models using the example scripts provided in the TensorRT Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
Below is an example showing how to quantize a model using modelopt's PTQ API:
```python
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Load the model from HuggingFace
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
# Select the quantization config, for example, FP8
config = mtq.FP8_DEFAULT_CFG
# Define a forward loop function for calibration
def forward_loop(model):
    # calib_set: a user-provided iterable of calibration batches.
    for data in calib_set:
        model(data)
# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop)
```
After the model is quantized, you can export it to a quantized checkpoint using the export API:
```python
import torch
from modelopt.torch.export import export_hf_checkpoint
with torch.inference_mode():
    export_hf_checkpoint(
        model,       # The quantized model.
        export_dir,  # The directory where the exported files will be stored.
    )
```
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
```python
from vllm import LLM, SamplingParams
def main():
    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```

View File

@ -126,7 +126,7 @@ oneshot(
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save quantized model
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

View File

@ -19,6 +19,12 @@ pip install amd-quark
You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
## Quantization Process
After installing Quark, we will use an example to illustrate how to use Quark.
@ -150,6 +156,7 @@ LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():

View File

@ -80,7 +80,7 @@ The table below shows the compatibility of various quantization implementations
* ✅︎
* ✅︎
* ✅︎
* ✅︎
*
*
*
*
@ -129,7 +129,17 @@ The table below shows the compatibility of various quantization implementations
*
*
*
- * modelopt
* ✅︎
* ✅︎
* ✅︎
* ✅︎
* ✅︎
*
*
*
*
*
:::
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.

View File

@ -15,16 +15,16 @@ vLLM currently supports the following reasoning models:
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
- IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
## Quickstart
To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
To use reasoning models, you need to specify the `--reasoning-parser` flag when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```
Next, make a request to the model that should return the reasoning content in the response.
@ -139,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.
```bash
VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```
Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
@ -315,9 +314,8 @@ class DeepSeekReasoner(Reasoner):
The structured output engine like `xgrammar` will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.
Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags.
Finally, you can enable reasoning for the model by using the `--reasoning-parser` flag.
```bash
vllm serve <model_tag> \
--enable-reasoning --reasoning-parser example
vllm serve <model_tag> --reasoning-parser example
```

View File

@ -158,7 +158,7 @@ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
Run the setup script:
```bash
VLLM_TARGET_DEVICE="tpu" python setup.py develop
VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
```
## Set up using Docker

View File

@ -1,6 +1,6 @@
# Installation
vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
vLLM contains pre-compiled C++ and CUDA (12.6) binaries.
## Requirements
@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
You can install vLLM using either `pip` or `uv pip`:
```console
# Install vLLM with CUDA 12.4.
# Install vLLM with CUDA 12.6.
pip install vllm # If you are using pip.
uv pip install vllm # If you are using uv.
```
As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions:
As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions:
```console
# Install vLLM with CUDA 11.8.

View File

@ -73,7 +73,22 @@ Currently, there are no pre-built ROCm wheels.
You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
:::
3. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps:
```console
python3 -m pip uninstall -y aiter
git clone --recursive https://github.com/ROCm/aiter.git
cd aiter
git checkout $AITER_BRANCH_OR_COMMIT
git submodule sync; git submodule update --init --recursive
python3 setup.py develop
```
:::{note}
You will need to configure `$AITER_BRANCH_OR_COMMIT` for your purpose.
:::
4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
```bash
$ pip install --upgrade pip

View File

@ -35,13 +35,6 @@ pip install -v -r requirements/xpu.txt
VLLM_TARGET_DEVICE=xpu python setup.py install
```
- Finally, due to a known issue of conflict dependency(oneapi related) in torch-xpu 2.6 and ipex-xpu 2.6, we install ipex here. This will be fixed in the ipex-xpu 2.7.
```console
pip install intel-extension-for-pytorch==2.6.10+xpu \
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
```
:::{note}
- FP16 is the default data type in the current XPU backend. The BF16 data
type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
@ -81,5 +74,3 @@ python -m vllm.entrypoints.openai.api_server \
```
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.

View File

@ -181,6 +181,7 @@ design/v1/metrics
:maxdepth: 2
contributing/overview
contributing/deprecation_policy
contributing/profiling/profiling_index
contributing/dockerfile/dockerfile
contributing/model/index
@ -193,11 +194,8 @@ contributing/vulnerability_management
:caption: API Reference
:maxdepth: 2
api/offline_inference/index
api/engine/index
api/inference_params
api/multimodal/index
api/model/index
api/summary
api/vllm/vllm
:::
% Latest news and acknowledgements

View File

@ -14,7 +14,7 @@ Usually, this is automatically inferred so you don't have to specify it.
## Offline Inference
The {class}`~vllm.LLM` class provides various methods for offline inference.
See [Engine Arguments](#engine-args) for a list of options when initializing the model.
See <project:#configuration> for a list of options when initializing the model.
### `LLM.generate`

View File

@ -60,7 +60,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference
The {class}`~vllm.LLM` class provides various methods for offline inference.
See [Engine Arguments](#engine-args) for a list of options when initializing the model.
See <project:#configuration> for a list of options when initializing the model.
### `LLM.encode`

View File

@ -542,8 +542,8 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
- * `Qwen3MoeForCausalLM`
* Qwen3MoE
* `Qwen/Qwen3-MoE-15B-A2B`, etc.
* ✅︎
* `Qwen/Qwen3-30B-A3B`, etc.
*
* ✅︎
- * `StableLmForCausalLM`
* StableLM
@ -979,11 +979,18 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
* ✅︎
- * `MiniMaxVL01ForConditionalGeneration`
* MiniMax-VL
* T + I<sup>E+</sup>
* `MiniMaxAI/MiniMax-VL-01`, etc.
*
* ✅︎
* ✅︎
- * `Mistral3ForConditionalGeneration`
* Mistral3
* T + I<sup>+</sup>
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.
*
* ✅︎
* ✅︎
* ✅︎
- * `MllamaForConditionalGeneration`
@ -1007,6 +1014,13 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `Ovis2ForConditionalGeneration`<sup>^</sup>
* Ovis2
* T + I<sup>+</sup>
* `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc.
*
*
* ✅︎
- * `PaliGemmaForConditionalGeneration`
* PaliGemma, PaliGemma 2
* T + I<sup>E</sup>

View File

@ -2,65 +2,188 @@
# Optimization and Tuning
This guide covers optimization strategies and performance tuning for vLLM V1.
## Preemption
Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
available again. When this occurs, the following warning is printed:
In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
available again. When this occurs, you may see the following warning:
```text
WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.RECOMPUTE mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
```
While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency.
If you frequently encounter preemptions from the vLLM engine, consider the following actions:
If you frequently encounter preemptions, consider the following actions:
- Increase `gpu_memory_utilization`. The vLLM pre-allocates GPU cache by using gpu_memory_utilization% of memory. By increasing this utilization, you can provide more KV cache space.
- Decrease `max_num_seqs` or `max_num_batched_tokens`. This can reduce the number of concurrent requests in a batch, thereby requiring less KV cache space.
- Increase `tensor_parallel_size`. This approach shards model weights, so each GPU has more memory available for KV cache.
- Increase `pipeline_parallel_size`. This approach distributes model layers across GPUs, reducing the memory needed for model weights on each GPU, which indirectly leaves more memory available for KV cache.
- Increase `gpu_memory_utilization`. vLLM pre-allocates GPU cache using this percentage of memory. By increasing utilization, you can provide more KV cache space.
- Decrease `max_num_seqs` or `max_num_batched_tokens`. This reduces the number of concurrent requests in a batch, thereby requiring less KV cache space.
- Increase `tensor_parallel_size`. This shards model weights across GPUs, allowing each GPU to have more memory available for KV cache. However, increasing this value may cause excessive synchronization overhead.
- Increase `pipeline_parallel_size`. This distributes model layers across GPUs, reducing the memory needed for model weights on each GPU, indirectly leaving more memory available for KV cache. However, increasing this value may cause latency penalties.
You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
You can monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting `disable_log_stats=False`.
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
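For example, the knobs above map directly onto `LLM` constructor arguments (illustrative values):

```python
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    gpu_memory_utilization=0.95,  # give the KV cache more headroom
    max_num_seqs=128,             # cap concurrent sequences per batch
    tensor_parallel_size=2,       # shard weights across two GPUs
)
```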
(chunked-prefill)=
## Chunked Prefill
vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor.
In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics.
```python
from vllm import LLM
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
# Set max_num_batched_tokens to tune performance.
# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048)
```
Without chunked prefill, the vLLM scheduler prioritizes prefills and does not batch prefill and decode requests together. This policy optimizes TTFT (time to first token) but incurs slower ITL (inter-token latency) and inefficient GPU utilization.
With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
This policy has two benefits:
- It improves ITL and decode generation speed, because decode requests are prioritized.
- It achieves better GPU utilization by colocating compute-bound (prefill) and memory-bound (decode) requests in the same batch.
### Performance Tuning with Chunked Prefill
You can tune the performance by adjusting `max_num_batched_tokens`:
- Smaller values (e.g., 2048) achieve better inter-token latency (ITL) because there are fewer prefills slowing down decodes.
- Higher values achieve better time to first token (TTFT) as you can process more prefill tokens in a batch.
- For optimal throughput, we recommend setting `max_num_batched_tokens > 8096` especially for smaller models on large GPUs.
- If `max_num_batched_tokens` is the same as `max_model_len`, that is almost equivalent to the V0 default scheduling policy (except that it still prioritizes decodes).
```python
from vllm import LLM
# Set max_num_batched_tokens to tune performance
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_num_batched_tokens=16384)
```
See related papers for more details (<https://arxiv.org/pdf/2401.08671> or <https://arxiv.org/pdf/2308.16369>).
Please try out this feature and let us know your feedback via GitHub issues!
## Parallelism Strategies
vLLM supports multiple parallelism strategies that can be combined to optimize performance across different hardware configurations.
### Tensor Parallelism (TP)
Tensor parallelism shards model parameters across multiple GPUs within each model layer. This is the most common strategy for large model inference within a single node.
**When to use:**
- When the model is too large to fit on a single GPU
- When you need to reduce memory pressure per GPU to allow more KV cache space for higher throughput
```python
from vllm import LLM
# Split model across 4 GPUs
llm = LLM(model="meta-llama/Llama-3.3-70B-Instruct", tensor_parallel_size=4)
```
For models that are too large to fit on a single GPU (like 70B parameter models), tensor parallelism is essential.
### Pipeline Parallelism (PP)
Pipeline parallelism distributes model layers across multiple GPUs. Each GPU processes different parts of the model in sequence.
**When to use:**
- When you've already maxed out efficient tensor parallelism but need to distribute the model further, or across nodes
- For very deep and narrow models where layer distribution is more efficient than tensor sharding
Pipeline parallelism can be combined with tensor parallelism for very large models:
```python
from vllm import LLM
# Combine pipeline and tensor parallelism
llm = LLM(
    model="meta-llama/Llama-3.3-70B-Instruct",
    tensor_parallel_size=4,
    pipeline_parallel_size=2
)
```
### Expert Parallelism (EP)
Expert parallelism is a specialized form of parallelism for Mixture of Experts (MoE) models, where different expert networks are distributed across GPUs.
**When to use:**
- Specifically for MoE models (like DeepSeekV3, Qwen3MoE, Llama-4)
- When you want to balance the expert computation load across GPUs
Enable expert parallelism by setting `enable_expert_parallel=True`, which uses expert parallelism instead of tensor parallelism for MoE layers. It uses the same degree of parallelism as the tensor parallel size.
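A minimal sketch is shown below (the model name is illustrative; any MoE model applies):

```python
from vllm import LLM

# Shard MoE expert layers across the same 4 GPUs
# configured for tensor parallelism.
llm = LLM(
    model="Qwen/Qwen3-30B-A3B",  # illustrative MoE model
    tensor_parallel_size=4,
    enable_expert_parallel=True,
)
```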
### Data Parallelism (DP)
Data parallelism replicates the entire model across multiple GPU sets and processes different batches of requests in parallel.
**When to use:**
- When you have enough GPUs to replicate the entire model
- When you need to scale throughput rather than model size
- In multi-user environments where isolation between request batches is beneficial
Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`.
Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.
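For example, combining data parallelism with tensor parallelism might look like the following sketch (the model and sizes are illustrative):

```python
from vllm import LLM

# Two model replicas, each sharded across 2 GPUs (4 GPUs total).
# MoE layers, if present, are sharded across
# tensor_parallel_size * data_parallel_size = 4 partitions.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    tensor_parallel_size=2,
    data_parallel_size=2,
)
```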
## Reducing Memory Usage
If you encounter out-of-memory issues, consider these strategies:
### Context Length and Batch Size
You can reduce memory usage by limiting the context length and batch size:
```python
from vllm import LLM
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    max_model_len=2048,  # Limit context window
    max_num_seqs=4  # Limit batch size
)
```
### Adjust CUDA Graph Compilation
CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level:
```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        cudagraph_capture_sizes=[1, 2, 4, 8]  # Capture fewer batch sizes
    )
)
```
Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`:
```python
from vllm import LLM
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    enforce_eager=True  # Disable CUDA graph compilation
)
```
### Multimodal Models
For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request:
```python
from vllm import LLM
# Accept up to 2 images per prompt
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 2}
)
```


@ -25,7 +25,7 @@ The available APIs depend on the type of model that is being run:
Please refer to the above pages for more details about each API.
:::{seealso}
[API Reference](/api/offline_inference/index)
[API Reference](#offline-inference-api)
:::
(configuration-options)=
@ -33,7 +33,7 @@ Please refer to the above pages for more details about each API.
## Configuration Options
This section lists the most common options for running the vLLM engine.
For a full list, refer to the [Engine Arguments](#engine-args) page.
For a full list, refer to the <project:#configuration> page.
(model-resolution)=


@ -44,8 +44,8 @@ The main script generates several log files:
## 2. CPU Offload Examples
- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1
## 3. KV Cache Sharing


@ -1,22 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache.
with LMCache in vLLM v1 or v0.
Usage:
Specify vLLM version
-v v0 : Use LMCacheConnector
model = mistralai/Mistral-7B-Instruct-v0.2
(Includes enable_chunked_prefill = True)
-v v1 : Use LMCacheConnectorV1 (default)
model = meta-llama/Meta-Llama-3.1-8B-Instruct
(Without enable_chunked_prefill)
Note that `lmcache` is needed to run this example.
Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import argparse
import contextlib
import os
import time
from dataclasses import asdict
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs
def setup_environment_variables():
@ -32,18 +47,32 @@ def setup_environment_variables():
@contextlib.contextmanager
def build_llm_with_lmcache():
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
def build_llm_with_lmcache(lmcache_connector: str, model: str,
vllm_version: str):
ktc = KVTransferConfig(
kv_connector=lmcache_connector,
kv_role="kv_both",
)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
kv_transfer_config=ktc,
max_model_len=8000,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8)
if vllm_version == "v0":
llm_args = EngineArgs(
model=model,
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
enable_chunked_prefill=True, # Only in v0
)
else:
llm_args = EngineArgs(
model=model,
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
)
llm = LLM(**asdict(llm_args))
try:
yield llm
finally:
@ -57,6 +86,9 @@ def print_output(
sampling_params: SamplingParams,
req_str: str,
):
# Should be able to see logs like the following:
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
# This indicates that the KV cache has been stored in LMCache.
start = time.time()
outputs = llm.generate(prompt, sampling_params)
print("-" * 50)
@ -68,10 +100,29 @@ def print_output(
print("-" * 50)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-v",
"--version",
choices=["v0", "v1"],
default="v1",
help="Specify vLLM version (default: v1)")
return parser.parse_args()
def main():
args = parse_args()
if args.version == "v0":
lmcache_connector = "LMCacheConnector"
model = "mistralai/Mistral-7B-Instruct-v0.2"
else:
lmcache_connector = "LMCacheConnectorV1"
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables()
with build_llm_with_lmcache() as llm:
with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts


@ -1,57 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache in vLLM v1.
Note that lmcache needs to be installed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Enable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
# This example script runs two requests with a shared prefix.
shared_prompt = "Hello, how are you?" * 1000
first_prompt = [
shared_prompt + "Hello, my name is",
]
second_prompt = [
shared_prompt + "Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note that LMCache is not compatible with chunked prefill for now.
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8)
# Should be able to see logs like the following:
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
# This indicates that the KV cache has been stored in LMCache.
outputs = llm.generate(first_prompt, sampling_params)
for output in outputs:
generated_text = output.outputs[0].text
print(f"Generated text: {generated_text!r}")
# Clean up lmcache backend
LMCacheEngineBuilder.destroy(ENGINE_NAME)


@ -36,6 +36,10 @@ def parse_args():
help="downloaded from the eagle repo " \
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
)
parser.add_argument("--method",
type=str,
default='eagle',
choices=['eagle', 'eagle3'])
parser.add_argument("--max_num_seqs", type=int, default=8)
parser.add_argument("--num_prompts", type=int, default=80)
parser.add_argument("--num_spec_tokens", type=int, default=2)
@ -53,7 +57,13 @@ def main():
args = parse_args()
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
if args.method == 'eagle':
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
elif args.method == 'eagle3':
eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
else:
raise ValueError(f"unknown method: {args.method}")
max_model_len = 2048
@ -81,7 +91,7 @@ def main():
max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8,
speculative_config={
"method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
"method": args.method,
"model": eagle_dir,
"num_speculative_tokens": args.num_spec_tokens,
"draft_tensor_parallel_size": args.draft_tp,


@ -14,7 +14,7 @@ import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1


@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
"image":
ImageAsset("cherry_blossom").pil_image.convert("RGB"),
"video":
VideoAsset(name="sample_demo_1.mp4",
num_frames=16).np_ndarrays,
VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
},
},
limit_mm_per_prompt={
@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
"<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n")
asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
"Please launch this example with "


@ -725,6 +725,34 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
)
# Ovis2
def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "AIDC-AI/Ovis2-1B"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
limit_mm_per_prompt={"image": 1},
)
placeholder = "<image>\n"
prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1041,6 +1069,7 @@ model_example_map = {
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"ovis2": run_ovis2,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
@ -1080,7 +1109,7 @@ def get_multi_modal_input(args):
if args.modality == "video":
# Input video and question
video = VideoAsset(name="sample_demo_1.mp4",
video = VideoAsset(name="baby_reading",
num_frames=args.num_frames).np_ndarrays
vid_questions = ["Why is this video funny?"]

Some files were not shown because too many files have changed in this diff.