wip

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
enable piecewise cudagraphs for eagle
2025-09-17 22:41:43 +00:00 · 2025-09-17 20:48:14 +00:00 · 2025-09-17 20:33:11 +00:00 · 2025-09-17 12:37:29 -07:00 · 2025-09-17 11:53:12 -07:00 · 2025-09-17 11:15:05 -07:00
789 changed files with 41893 additions and 31017 deletions
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -8,7 +8,7 @@ This benchmark aims to:

 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

 ## Setup

--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,24 +1,22 @@
 steps:
  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
-
  - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
@ -30,12 +28,8 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
  - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
+    depends_on: ~
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
@ -102,8 +96,6 @@ steps:
    depends_on:
      - create-multi-arch-manifest
      - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:

 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF 
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -66,7 +66,6 @@ function cpu_tests() {

    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"

--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -30,6 +30,7 @@ docker run \
    bash -c '
    set -e
    echo $ZE_AFFINITY_MASK
+    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -46,7 +46,6 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
-  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs.py
  - tests/test_outputs.py
@ -54,15 +53,16 @@ steps:
  - tests/utils_
  - tests/worker
  - tests/standalone_tests/lazy_imports.py
+  - tests/transformers_utils
  commands:
  - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
  - pytest -v -s utils_ # Utils
  - pytest -v -s worker # Worker
+  - pytest -v -s transformers_utils # transformers_utils

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@ -91,18 +91,18 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test # 22min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+- label: Entrypoints Unit Tests # 5min
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
  fast_check: true
  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
+  - vllm/entrypoints
+  - tests/entrypoints/
  commands:
-  - pytest -v -s core
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

- label: Entrypoints Test (LLM) # 30min
+- label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
@ -119,7 +119,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
@ -132,9 +132,22 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

+- label: Entrypoints Integration Test (Pooling)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
 - label: Distributed Tests (4 GPUs) # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@ -205,7 +218,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/metrics
-  - tests/tracing
+  - tests/v1/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
@ -310,7 +323,6 @@ steps:
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
@ -369,6 +381,7 @@ steps:
    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
+    - pytest -v -s compile/test_noop_elimination.py

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -379,11 +392,7 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
-  # these tests need to be separated, cannot combine
-  - pytest -v -s compile/piecewise/test_simple.py
-  - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/piecewise/test_full_cudagraph.py
-  - pytest -v -s compile/piecewise/test_multiple_graphs.py
+  - pytest -v -s compile/piecewise/

 - label: PyTorch Fullgraph Test # 20min
  timeout_in_minutes: 30
@ -501,6 +510,10 @@ steps:
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

@ -523,15 +536,6 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 12min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/encoder_decoder
-  commands:
-    - pytest -v -s encoder_decoder
-
 - label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
@ -546,36 +550,85 @@ steps:

 #####  models test  #####

- label: Basic Models Test # 57min
-  timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
-  - tests/models
+  - tests/models/test_initialization.py
  commands:
-    - pytest -v -s models/test_transformers.py
-    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_utils.py
-    - pytest -v -s models/test_vision.py
-    - pytest -v -s models/test_initialization.py
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test.) Also run if model initialization test file is modified
+    - pytest -v -s models/test_initialization.py \
+             -k 'not test_can_initialize_small_subset' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  commands:
+    - pytest -v -s models/test_transformers.py \
+                   models/test_registry.py \
+                   models/test_utils.py \
+                   models/test_vision.py
+
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
+    # Test standard language models, excluding a subset of slow tests
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/language -m core_model
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'

- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
@ -583,7 +636,12 @@ steps:
    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-    - pytest -v -s models/language/generation -m hybrid_model
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation \
+                   -m hybrid_model \
+                   --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+                   --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2

 - label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
@ -597,6 +655,16 @@ steps:
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
 - label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@ -607,6 +675,16 @@ steps:
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'

+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
+
 - label: Multi-Modal Processor Test # 44min
  timeout_in_minutes: 60
  source_file_dependencies:
@ -627,7 +705,7 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
@ -713,7 +791,8 @@ steps:
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-    - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
@ -729,6 +808,20 @@ steps:
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

+- label: GPT-OSS Eval (Blackwell)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+
 #####  1 GPU test  #####
 #####  multi gpus test  #####

@ -743,6 +836,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
@ -801,7 +896,8 @@ steps:
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
@ -827,7 +923,7 @@ steps:
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
-  - pip uninstall prithvi_io_processor_plugin -y 
+  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
@ -851,7 +947,6 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
-  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support

 - label: LoRA TP Test (Distributed) # 17 min
  timeout_in_minutes: 30
@ -875,7 +970,7 @@ steps:
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2 
+  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
@ -925,9 +1020,21 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
+  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
--- a/.coveragerc
+++ b/.coveragerc
@ -0,0 +1,32 @@
+[run]
+source = vllm
+omit =
+    */tests/*
+    */test_*
+    */__pycache__/*
+    */build/*
+    */dist/*
+    */vllm.egg-info/*
+    */third_party/*
+    */examples/*
+    */benchmarks/*
+    */docs/*
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
--- a/.github/.bc-linter.yml
+++ b/.github/.bc-linter.yml
@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attention/*.py"
+#   - "vllm/v1/core/*.py"
+exclude:
+  - "**/*.py"
+
+scan:
+  functions: true        # check free functions and methods
+  classes: true          # check classes/dataclasses
+  public_only: true      # ignore names starting with "_" at any level
+
+annotations:
+  include:               # decorators that force‑include a symbol
+    - name: "bc_linter_include"  # matched by simple name or dotted suffix
+      propagate_to_members: false # for classes, include methods/inner classes
+  exclude:               # decorators that force‑exclude a symbol
+    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
+      propagate_to_members: true  # for classes, exclude methods/inner classes
+
+excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,23 +2,27 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
@ -29,7 +33,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -37,18 +45,23 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche 
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC

 # Docs
 /docs @hmellor
@ -91,3 +104,12 @@ mkdocs.yaml @hmellor
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras
 /vllm/attention/ops/rocm*.py @gshtras
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+
+# TPU (worker, platform, and sampler sources under /vllm, plus the TPU tests under /tests)
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -124,9 +124,16 @@ pull_request_rules:
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
+      - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+      - files~=^tests/entrypoints/test_context.py
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+      - files~=^vllm/entrypoints/harmony_utils.py
+      - files~=^vllm/entrypoints/tool_server.py
+      - files~=^vllm/entrypoints/tool.py
+      - files~=^vllm/entrypoints/context.py
      - title~=(?i)gpt[-_]?oss
+      - title~=(?i)harmony
  actions:
    label:
      add:
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  bc_lint:
+    if: github.repository_owner == 'vllm-project'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Run BC Lint Action
+        uses: pytorch/test-infra/.github/actions/bc-lint@main
+        with:
+          repo: ${{ github.event.pull_request.head.repo.full_name }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
+          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+          config_dir: .github
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
--- a/.yapfignore
+++ b/.yapfignore
@ -1 +1,2 @@
 collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)

+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@ -171,6 +175,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+  endif()
+endif()
+
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -294,7 +308,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu"
    "csrc/quantization/fp8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
@ -581,7 +594,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu"
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -779,6 +791,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

+  # Hadacore kernels (compiled only if the arch intersection below is non-empty)
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  if(HADACORE_ARCHS)
+    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${HADACORE_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    message(STATUS "Building hadacore")
+  endif()
+
 # if CUDA endif
 endif()

--- a/README.md
+++ b/README.md
@ -14,6 +14,9 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
 ---

 *Latest News* 🔥
@ -78,7 +81,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -1,807 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks

-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.

-## Dataset Overview
+## Contents

-<table style="width:100%; border-collapse: collapse;">
-  <thead>
-    <tr>
-      <th style="width:15%; text-align: left;">Dataset</th>
-      <th style="width:10%; text-align: center;">Online</th>
-      <th style="width:10%; text-align: center;">Offline</th>
-      <th style="width:65%; text-align: left;">Data Path</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td><strong>ShareGPT</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
-    </tr>
-    <tr>
-      <td><strong>ShareGPT4V (Image)</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>
-        <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
-        <br>
-        <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
-        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
-      </td>
-    </tr>
-        <tr>
-      <td><strong>ShareGPT4Video (Video)</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>
-        <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
-      </td>
-    </tr>
-    <tr>
-      <td><strong>BurstGPT</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
-    </tr>
-    <tr>
-      <td><strong>Sonnet (deprecated)</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
-    </tr>
-    <tr>
-      <td><strong>Random</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>synthetic</code></td>
-    </tr>
-    <tr>
-      <td><strong>RandomMultiModal (Image/Video)</strong></td>
-      <td style="text-align: center;">🟡</td>
-      <td style="text-align: center;">🚧</td>
-      <td><code>synthetic</code> </td>
-    </tr>
-    <tr>
-      <td><strong>Prefix Repetition</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>synthetic</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-VisionArena</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmarena-ai/VisionArena-Chat</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-InstructCoder</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>likaixin/InstructCoder</code></td>
-    </tr>
-      <tr>
-      <td><strong>HuggingFace-AIMO</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-Other</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
-    </tr>
-    <tr>
-      <td><strong>Custom</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>Local file: <code>data.jsonl</code></td>
-    </tr>
-  </tbody>
-</table>
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
+- **Throughput benchmarks**: Scripts for testing offline batch inference performance
+- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
+- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)

-✅: supported
+## Usage

-🟡: Partial support
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).

-🚧: to be supported
+For the full CLI reference, see:

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
-For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
-
-```bash
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
-```
-
-## 🚀 Example - Online Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-First start serving your model
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-Then run the benchmarking script
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-============ Serving Benchmark Result ============
-Successful requests:                     10
-Benchmark duration (s):                  5.78
-Total input tokens:                      1369
-Total generated tokens:                  2212
-Request throughput (req/s):              1.73
-Output token throughput (tok/s):         382.89
-Total Token throughput (tok/s):          619.85
---------------Time to First Token----------------
-Mean TTFT (ms):                          71.54
-Median TTFT (ms):                        73.88
-P99 TTFT (ms):                           79.49
-----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms):                          7.91
-Median TPOT (ms):                        7.96
-P99 TPOT (ms):                           8.03
---------------Inter-token Latency----------------
-Mean ITL (ms):                           7.74
-Median ITL (ms):                         7.70
-P99 ITL (ms):                            8.39
-==================================================
-```
-
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
-
-```json
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
-
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
-```
-
-```bash
-# run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
-  --backend vllm \
-  --model meta-llama/Llama-3.1-8B-Instruct \
-  --endpoint /v1/completions \
-  --dataset-name custom \
-  --dataset-path <path-to-your-data-jsonl> \
-  --custom-skip-chat-template \
-  --num-prompts 80 \
-  --max-concurrency 1 \
-  --temperature=0.3 \
-  --top-p=0.75 \
-  --result-dir "./log/"
-```
-
-You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --hf-split train \
-  --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
-```
-
-``` bash
-vllm bench serve \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name hf \
-    --dataset-path likaixin/InstructCoder \
-    --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-``` bash
-vllm bench serve \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path AI-MO/aimo-validation-aime \
-    --num-prompts 10 \
-    --seed 42
-```
-
-`philschmid/mt-bench`:
-
-``` bash
-vllm bench serve \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path philschmid/mt-bench \
-    --num-prompts 80
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-vllm bench serve \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --top-k 10 \
-  --top-p 0.9 \
-  --temperature 0.5 \
-  --num-prompts 10
-```
-
-### Running With Ramp-Up Request Rate
-
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-
-Two ramp-up strategies are supported:
-
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
-
-The following arguments can be used to control the ramp-up:
-
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
-
-</details>
-
-## 📈 Example - Offline Throughput Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-```bash
-vllm bench throughput \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset-name sonnet \
-  --dataset-path vllm/benchmarks/sonnet.txt \
-  --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
-Total num prompt tokens:  5014
-Total num output tokens:  1500
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-vllm bench throughput \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --num-prompts 1000 \
-  --hf-split train
-```
-
-The `num prompt tokens` now includes image token counts
-
-```text
-Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
-Total num prompt tokens:  14527
-Total num output tokens:  1280
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-vllm bench throughput \
-    --dataset-name=hf \
-    --dataset-path=likaixin/InstructCoder \
-    --model=meta-llama/Meta-Llama-3-8B-Instruct \
-    --input-len=1000 \
-    --output-len=100 \
-    --num-prompts=2048 \
-    --async-engine \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
-```
-
-```text
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens:  261136
-Total num output tokens:  204800
-```
-
-### Other HuggingFaceDataset Examples
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench throughput \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench throughput \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-```bash
-vllm bench throughput \
-  --model Qwen/QwQ-32B \
-  --backend vllm \
-  --dataset-name hf \
-  --dataset-path AI-MO/aimo-validation-aime \
-  --hf-split train \
-  --num-prompts 10
-```
-
-Benchmark with LoRA adapters:
-
-``` bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench throughput \
-  --model meta-llama/Llama-2-7b-hf \
-  --backend vllm \
-  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --dataset_name sharegpt \
-  --num-prompts 10 \
-  --max-loras 2 \
-  --max-lora-rank 8 \
-  --enable-lora \
-  --lora-path yard1/llama-2-7b-sql-lora-test
-  ```
-
-</details>
-
-## 🛠️ Example - Structured Output Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of structured output generation (JSON, grammar, regex).
-
-### Server Setup
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-### JSON Schema Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset json \
-  --structured-output-ratio 1.0 \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### Grammar-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset grammar \
-  --structure-type grammar \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### Regex-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset regex \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### Choice-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset choice \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### XGrammar Benchmark Dataset
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset xgrammar_bench \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-</details>
-
-## 📚 Example - Long Document QA Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of long document question-answering with prefix caching.
-
-### Basic Long Document QA Test
-
-```bash
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 16 \
-  --document-length 2000 \
-  --output-len 50 \
-  --repeat-count 5
-```
-
-### Different Repeat Modes
-
-```bash
-# Random mode (default) - shuffle prompts randomly
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 8 \
-  --document-length 3000 \
-  --repeat-count 3 \
-  --repeat-mode random
-
-# Tile mode - repeat entire prompt list in sequence
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 8 \
-  --document-length 3000 \
-  --repeat-count 3 \
-  --repeat-mode tile
-
-# Interleave mode - repeat each prompt consecutively
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 8 \
-  --document-length 3000 \
-  --repeat-count 3 \
-  --repeat-mode interleave
-```
-
-</details>
-
-## 🗂️ Example - Prefix Caching Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the efficiency of automatic prefix caching.
-
-### Fixed Prompt with Prefix Caching
-
-```bash
-python3 benchmarks/benchmark_prefix_caching.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-prompts 1 \
-  --repeat-count 100 \
-  --input-length-range 128:256
-```
-
-### ShareGPT Dataset with Prefix Caching
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-python3 benchmarks/benchmark_prefix_caching.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --enable-prefix-caching \
-  --num-prompts 20 \
-  --repeat-count 5 \
-  --input-length-range 128:256
-```
-
-### Prefix Repetition Dataset
-
-```bash
-vllm bench serve \
-  --backend openai \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --dataset-name prefix_repetition \
-  --num-prompts 100 \
-  --prefix-repetition-prefix-len 512 \
-  --prefix-repetition-suffix-len 128 \
-  --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128
-```
-
-</details>
-
-## ⚡ Example - Request Prioritization Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of request prioritization in vLLM.
-
-### Basic Prioritization Test
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --input-len 128 \
-  --output-len 64 \
-  --num-prompts 100 \
-  --scheduling-policy priority
-```
-
-### Multiple Sequences per Prompt
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --input-len 128 \
-  --output-len 64 \
-  --num-prompts 100 \
-  --scheduling-policy priority \
-  --n 2
-```
-
-</details>
-
-## 👁️ Example - Multi-Modal Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of multi-modal requests in vLLM.
-
-### Images (ShareGPT4V)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dtype bfloat16 \
-  --limit-mm-per-prompt '{"image": 1}' \
-  --allowed-local-media-path /path/to/sharegpt4v/images
-```
-
-Send requests with images:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dataset-name sharegpt \
-  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
-  --num-prompts 100 \
-  --save-result \
-  --result-dir ~/vllm_benchmark_results \
-  --save-detailed \
-  --endpoint /v1/chat/completion
-```
-
-### Videos (ShareGPT4Video)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dtype bfloat16 \
-  --limit-mm-per-prompt '{"video": 1}' \
-  --allowed-local-media-path /path/to/sharegpt4video/videos
-```
-
-Send requests with videos:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dataset-name sharegpt \
-  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
-  --num-prompts 100 \
-  --save-result \
-  --result-dir ~/vllm_benchmark_results \
-  --save-detailed \
-  --endpoint /v1/chat/completion
-```
-
-### Synthetic Random Images (random-mm)
-
-Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
-
-Notes:
-
- Works only with online benchmark via the OpenAI  backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.
-
-Start the server (example):
-
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
-  --dtype bfloat16 \
-  --max-model-len 16384 \
-  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
-  --mm-processor-kwargs max_pixels=1003520
-```
-
-Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
-
-Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-3B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name random-mm \
-  --num-prompts 100 \
-  --max-concurrency 10 \
-  --random-prefix-len 25 \
-  --random-input-len 300 \
-  --random-output-len 40 \
-  --random-range-ratio 0.2 \
-  --random-mm-base-items-per-request 2 \
-  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
-  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
-  --request-rate inf \
-  --ignore-eos \
-  --seed 42
-```
-
-The number of items per request can be controlled by passing multiple image buckets:
-
-```bash
-  --random-mm-base-items-per-request 2 \
-  --random-mm-num-mm-items-range-ratio 0.5 \
-  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
-  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
-```
-
-Flags specific to `random-mm`:
-
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
-
-Behavioral notes:
-
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
-
-How sampling works:
-
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
-This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
-
-</details>
+- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
+- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
+- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@ -4,7 +4,10 @@
 import torch

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    CUTLASS_BLOCK_FP8_SUPPORTED,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton as vllm_triton
@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [
 ]


-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    """Build runner function for w8a8 block fp8 matmul."""
    factor_for_scale = 1e-2

@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
    fp8_max, fp8_min = fp8_info.max, fp8_info.min

    # Create random FP8 tensors
-    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max

-    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

    # Create scales
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

-    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
    Bs = (
        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
        * factor_for_scale
    )

+    # SM90 CUTLASS requires row-major format for scales
+    if use_cutlass and current_platform.is_device_capability(90):
+        Bs = Bs.T.contiguous()
+
    def run():
-        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+        if use_cutlass:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+            )
+        else:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+            )

    return run


+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"  # NOTE(review): looks unused; plot_name below repeats this literal
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:  # benchmark the CUTLASS provider only when this flag is set
+    available_providers.append("w8a8-block-fp8-cutlass")
+
+
@vllm_triton.testing.perf_report(
    vllm_triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
-        line_vals=["torch-bf16", "w8a8-block-fp8"],
-        line_names=["torch-bf16", "w8a8-block-fp8"],
+        line_vals=available_providers,
+        line_names=available_providers,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs W8A8 Block FP8 GEMMs",
        args={},
@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
-    else:  # w8a8-block-fp8
-        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
-        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: run_w8a8(), quantiles=quantiles
+    elif provider == "w8a8-block-fp8-triton":
+        run_w8a8_triton = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=False
        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_triton(), quantiles=quantiles
+        )
+    elif provider == "w8a8-block-fp8-cutlass":
+        run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=True
+        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_cutlass(), quantiles=quantiles
+        )
+    else:
+        raise ValueError(f"Unknown provider: {provider}")

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -2,14 +2,25 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from typing import Callable
+from unittest.mock import patch

+import pandas as pd
 import torch

-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+
+def with_triton_mode(fn):
+    """Return a wrapper that runs ``fn`` on the Triton fallback path.
+
+    For the duration of each call,
+    ``vllm.platforms.current_platform.is_cuda`` is patched to return ``False``;
+    the patch is removed again when the call returns or raises.
+    """
+
+    def _call_with_cuda_masked(*call_args, **call_kwargs):
+        cuda_mask = patch(
+            "vllm.platforms.current_platform.is_cuda", return_value=False
+        )
+        with cuda_mask:
+            return fn(*call_args, **call_kwargs)
+
+    return _call_with_cuda_masked


 # TODO(luka): use standalone_compile utility
@ -21,78 +32,236 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
    return inner


-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
-    torch_per_token_quant_fp8 = torch.compile(
-        QuantFP8(False, GroupShape.PER_TOKEN),
-        fullgraph=True,
-        dynamic=False,  # recompile for different shapes
-    )
+def bench_compile(fn: Callable):
+    """Compile ``fn`` with ``torch.compile`` and wrap it with ``with_dyn_arg``.
+
+    Args:
+        fn: Callable to compile (compiled with ``fullgraph=True`` and
+            ``dynamic=False``).
+
+    Returns:
+        The result of ``with_dyn_arg(compiled_fn, 0, 0)``.
+    """
+    # recompile for different shapes
+    fwd = torch.compile(fn, fullgraph=True, dynamic=False)

    # First dim is explicitly dynamic to simulate vLLM usage
-    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+    return with_dyn_arg(fwd, 0, 0)


-def cuda_per_token_quant_fp8(
-    input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888


-def calculate_diff(batch_size: int, seq_len: int):
-    """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+    batch_size: int,
+    hidden_size: int,
+    group_shape: GroupShape,
+    dtype: torch.dtype,
+):
+    """Calculate the difference between Inductor and CUDA implementations.
+
+    Runs ``QuantFP8.forward_native`` both compiled (via ``bench_compile``) and
+    eagerly, compares each against ``forward_cuda``, and prints whether the
+    quantized outputs and scales all agree within ``rtol=1e-3, atol=1e-5``.
+
+    Args:
+        batch_size: Multiplied with ``hidden_size`` to get the row count.
+        hidden_size: NOTE(review): despite the name this only scales the number
+            of rows; the second dimension is fixed at 4096 - confirm intent.
+        group_shape: Quantization group shape passed to ``QuantFP8``.
+        dtype: Input tensor dtype.
+    """
    device = torch.device("cuda")
-    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+    x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)

-    torch_out, torch_scale = torch_per_token_quant_fp8(x)
-    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

-    if torch.allclose(
-        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
-    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+    torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
+
+    # Quantized outputs are compared after casting to float32
+    out_allclose = lambda o1, o2: torch.allclose(
+        o1.to(torch.float32),
+        o2.to(torch.float32),
+        rtol=1e-3,
+        atol=1e-5,
+    )
+    scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
+
+    # The CUDA result is the reference for both the compiled and eager paths
+    if (
+        out_allclose(cuda_out, torch_out)
+        and scale_allclose(cuda_scale, torch_scale)
+        and out_allclose(cuda_out, torch_eager_out)
+        and scale_allclose(cuda_scale, torch_eager_scale)
+    ):
        print("✅ All implementations match")
    else:
        print("❌ Implementations differ")


-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
+configs = []


-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size", "seq_len"],
-        x_vals=configs,
-        line_arg="provider",
-        line_vals=["torch", "cuda"],
-        line_names=["Torch", "CUDA"],
-        styles=[("blue", "-"), ("green", "-")],
-        ylabel="us",
-        plot_name="per-token-dynamic-quant-fp8-performance",
-        args={},
-    )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
-    dtype = torch.float16
+def benchmark_quantization(
+    batch_size,
+    hidden_size,
+    provider,
+    group_shape: GroupShape,
+    col_major: bool,
+    dtype: torch.dtype,
+):
+    """Time one QuantFP8 provider on a ``(batch_size * hidden_size, 4096)`` input.
+
+    Args:
+        batch_size: Multiplied with ``hidden_size`` to get the row count.
+        hidden_size: Row-count multiplier (the second dimension is fixed at 4096).
+        provider: One of ``"torch"`` (compiled ``forward_native``), ``"cuda"``
+            (``forward_cuda``) or ``"triton"`` (``forward_cuda`` wrapped in
+            ``with_triton_mode``).
+        group_shape: Quantization group shape passed to ``QuantFP8``.
+        col_major: Passed to ``QuantFP8`` as ``column_major_scales``.
+        dtype: Input tensor dtype.
+
+    Returns:
+        ``(q50, q80, q20)`` of the measured time, each multiplied by 1000.
+        ``(0, 0, 0)`` is returned for ``"triton"`` when ``group_shape`` is not
+        per-group, since that combination is not benchmarked.
+
+    Raises:
+        ValueError: If ``provider`` is not one of the names above.
+    """
    device = torch.device("cuda")

-    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+    x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)

    if provider == "torch":
-        fn = lambda: torch_per_token_quant_fp8(x.clone())
+        # Build the compiled wrapper once so the timed callable only runs it
+        compiled_native = bench_compile(quant_fp8.forward_native)
+        fn = lambda: compiled_native(x.clone())
    elif provider == "cuda":
-        fn = lambda: cuda_per_token_quant_fp8(x.clone())
+        fn = lambda: quant_fp8.forward_cuda(x.clone())
+    elif provider == "triton":
+        if not group_shape.is_per_group():
+            # Triton only supported for per-group
+            return 0, 0, 0
+
+        # The platform patch is still applied per call, inside the wrapper
+        triton_forward = with_triton_mode(quant_fp8.forward_cuda)
+        fn = lambda: triton_forward(x.clone())
+    else:
+        # Without this branch an unknown name fails later with an unbound `fn`
+        raise ValueError(f"Unknown provider: {provider}")

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)

    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+    df: pd.DataFrame,
+    baseline_col: str,
+    speedup_cols: list[str],
+    groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+    """
+    Compute geometric mean speedups over a baseline column.
+
+    The speedup of a row is ``baseline / column``. Rows where either timing is
+    missing or not strictly positive are left out for that column, so a
+    provider that reports ``0`` for an unsupported configuration does not
+    produce an infinite speedup. If no usable rows remain, the result is NaN.
+
+    Args:
+        df: Input dataframe
+        baseline_col: Column to use as baseline
+        speedup_cols: Columns to compute speedups for
+        groupby_cols: Columns to group by. If None, compute over entire df.
+
+    Returns:
+        pd.DataFrame with geometric mean speedups
+    """
+    from scipy.stats import gmean
+
+    def geo_speedup(group: pd.DataFrame) -> pd.Series:
+        baseline = group[baseline_col]
+        speedups = {}
+        for col in speedup_cols:
+            timings = group[col]
+            # Comparisons against NaN are False, so missing values drop out too
+            usable = (baseline > 0) & (timings > 0)
+            ratios = (baseline[usable] / timings[usable]).to_numpy()
+            speedups[col] = gmean(ratios) if len(ratios) > 0 else float("nan")
+        return pd.Series(speedups)
+
+    if groupby_cols is None:
+        result = geo_speedup(df).to_frame().T
+    else:
+        result = (
+            df.groupby(groupby_cols)
+            .apply(geo_speedup, include_groups=False)
+            .reset_index()
+        )
+
+    return result
+
+
 if __name__ == "__main__":
-    calculate_diff(batch_size=4, seq_len=4096)
-    benchmark_quantization.run(print_data=True)
+    # Parse the sweep options, optionally run the correctness check, then
+    # benchmark every configuration and print geomean speedups.
+    parser = FlexibleArgumentParser(
+        description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+    )
+    parser.add_argument("-c", "--check", action="store_true")
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
+    )
+    parser.add_argument(
+        "--hidden-sizes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Batch sizes to benchmark (default: 1,16,32,64,128)",
+    )
+    parser.add_argument(
+        "--group-sizes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Group sizes for GroupShape(1,N) to benchmark. "
+        "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+    )
+    parser.add_argument(
+        "--no-column-major",
+        action="store_true",
+        help="Disable column-major scales testing",
+    )
+
+    args = parser.parse_args()
+    assert args
+
+    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+    # Fall back to the built-in sweeps when a flag is omitted
+    hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+    batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
+
+    # Translate the integer group sizes into GroupShape values
+    if args.group_sizes is not None:
+        group_shapes = []
+        for size in args.group_sizes:
+            if size == 0:
+                group_shapes.append(GroupShape.PER_TENSOR)
+            elif size == -1:
+                group_shapes.append(GroupShape.PER_TOKEN)
+            else:
+                group_shapes.append(GroupShape(1, size))
+    else:
+        group_shapes = [
+            GroupShape.PER_TENSOR,
+            GroupShape.PER_TOKEN,
+            GroupShape(1, 64),
+            GroupShape(1, 128),
+        ]
+
+    column_major_scales = [False] if args.no_column_major else [True, False]
+
+    config_gen = itertools.product(
+        group_shapes,
+        column_major_scales,
+        batch_sizes,
+        hidden_sizes,
+    )
+
+    # filter out column-major scales for non-group, reverse order
+    # After reversal each tuple is (hidden_size, batch_size, col_major,
+    # group_shape), which is the order of x_names below.
+    configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+    print(f"Running {len(configs)} configurations:")
+    print(f"  Hidden sizes: {hidden_sizes}")
+    print(f"  Batch sizes: {batch_sizes}")
+    print(f"  Group shapes: {[str(g) for g in group_shapes]}")
+    print(f"  Column major scales: {column_major_scales}")
+    print()
+
+    if args.check:
+        for group_shape in group_shapes:
+            group_size = group_shape[1]
+            print(f"{group_size=}")
+            calculate_diff(
+                batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+            )
+
+    # perf_report is applied here rather than as a decorator so that x_vals
+    # sees the configs list populated above.
+    benchmark = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["torch", "cuda", "triton"],
+            line_names=["Torch (Compiled)", "CUDA", "Triton"],
+            styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+            ylabel="us",
+            plot_name="QuantFP8 performance",
+            args={},
+        )
+    )(benchmark_quantization)
+
+    # NOTE(review): dtype is neither an x_name nor the line_arg; presumably run()
+    # forwards it to benchmark_quantization as an extra kwarg - confirm.
+    df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+    # Print geomean speedups
+    geo_table_grouped = compute_geomean_speedups(
+        df,
+        baseline_col="Torch (Compiled)",
+        speedup_cols=["CUDA", "Triton"],
+        groupby_cols=["col_major", "group_shape"],
+    )
+
+    print("Speedup over Torch (Compiled)")
+    print(geo_table_grouped.to_string(index=False))
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+Usage:
+    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
+
+Example:
+    torchrun --nproc_per_node=2 benchmark_device_communicators.py
+    --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+    """Benchmark class for testing device communicators.
+
+    Tries to build a CustomAllreduce, a PyNcclCommunicator and two
+    SymmMemCommunicator variants (multimem and two-shot) for this rank, then
+    times their allreduce through CUDA graph replay. A communicator that fails
+    to initialise, or reports ``disabled``, is stored as ``None`` and skipped.
+    """
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        device: torch.device,
+        cpu_group: ProcessGroup,
+        sequence_lengths: list[int],
+    ):
+        """Store the run parameters and initialise all communicators.
+
+        Args:
+            rank: Rank of this process (used in log messages).
+            world_size: Number of participating processes.
+            device: Device the communicators and test tensors are created on.
+            cpu_group: Process group handed to every communicator.
+            sequence_lengths: Sequence lengths that will be benchmarked; the
+                largest one sizes the communicator buffers.
+        """
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.cpu_group = cpu_group
+
+        # Calculate max_size_override based on largest sequence length
+        max_seq_len = max(sequence_lengths)
+        max_tensor_elements = max_seq_len * HIDDEN_SIZE
+        self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+        # Initialize communicators
+        self.custom_allreduce = None
+        self.pynccl_comm = None
+        self.symm_mem_comm = None
+        self.symm_mem_comm_multimem = None
+        self.symm_mem_comm_two_shot = None
+
+        self._init_communicators()
+
+    def _init_communicators(self):
+        """Initialize all available communicators.
+
+        Each communicator is built independently; a failure or a ``disabled``
+        communicator leaves the corresponding attribute as ``None``.
+        """
+        try:
+            self.custom_allreduce = CustomAllreduce(
+                group=self.cpu_group,
+                device=self.device,
+                max_size=self.max_size_override,
+            )
+            if not self.custom_allreduce.disabled:
+                logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+            else:
+                logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+                # Same convention as the other communicators: drop it when disabled
+                self.custom_allreduce = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+            )
+            self.custom_allreduce = None
+
+        try:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group, device=self.device
+            )
+            if not self.pynccl_comm.disabled:
+                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+            else:
+                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+                self.pynccl_comm = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+            )
+            self.pynccl_comm = None
+
+        # Initialize variants for SymmMemCommunicator
+        try:
+            self.symm_mem_comm_multimem = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                force_multimem=True,
+                max_size_override=self.max_size_override,
+            )
+            if not self.symm_mem_comm_multimem.disabled:
+                logger.info(
+                    "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+                )
+            else:
+                self.symm_mem_comm_multimem = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+                self.rank,
+                e,
+            )
+            self.symm_mem_comm_multimem = None
+
+        try:
+            self.symm_mem_comm_two_shot = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                force_multimem=False,
+                max_size_override=self.max_size_override,
+            )
+            if not self.symm_mem_comm_two_shot.disabled:
+                logger.info(
+                    "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+                )
+            else:
+                self.symm_mem_comm_two_shot = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+                self.rank,
+                e,
+            )
+            self.symm_mem_comm_two_shot = None
+
+    def benchmark_allreduce(
+        self, sequence_length: int, num_warmup: int, num_trials: int
+    ) -> dict[str, float]:
+        """Benchmark allreduce operations for all available communicators.
+
+        Args:
+            sequence_length: First dimension of the test tensor.
+            num_warmup: Graph replays performed before timing.
+            num_trials: Timed graph replays.
+
+        Returns:
+            Mapping from communicator name to latency in milliseconds.
+            Communicators that decline the tensor are left out.
+        """
+
+        results = {}
+
+        # Each entry: (name, allreduce fn, eligibility fn, capture context,
+        # value for VLLM_CUSTOM_ALLREDUCE_ALGO or None)
+        communicators = []
+
+        if self.custom_allreduce is not None:
+            comm = self.custom_allreduce
+            # CustomAllreduce one-shot
+            communicators.append(
+                (
+                    "ca_1stage",
+                    lambda t, c=comm: c.custom_all_reduce(t),
+                    lambda t, c=comm: c.should_custom_ar(t),
+                    comm.capture(),
+                    "1stage",  # env variable value
+                )
+            )
+            # CustomAllreduce two-shot
+            communicators.append(
+                (
+                    "ca_2stage",
+                    lambda t, c=comm: c.custom_all_reduce(t),
+                    lambda t, c=comm: c.should_custom_ar(t),
+                    comm.capture(),
+                    "2stage",  # env variable value
+                )
+            )
+
+        if self.pynccl_comm is not None:
+            comm = self.pynccl_comm
+            communicators.append(
+                (
+                    "pynccl",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t: True,  # Always available if initialized
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        if self.symm_mem_comm_multimem is not None:
+            comm = self.symm_mem_comm_multimem
+            communicators.append(
+                (
+                    "symm_mem_multimem",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_symm_mem(t),
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        if self.symm_mem_comm_two_shot is not None:
+            comm = self.symm_mem_comm_two_shot
+            communicators.append(
+                (
+                    "symm_mem_two_shot",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_symm_mem(t),
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        # Benchmark each communicator
+        algo_env = "VLLM_CUSTOM_ALLREDUCE_ALGO"
+        saved_algo = os.environ.get(algo_env)
+        try:
+            for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+                # Set environment variable if needed
+                if env_var is not None:
+                    os.environ[algo_env] = env_var
+                else:
+                    # Clear the environment variable to avoid interference
+                    os.environ.pop(algo_env, None)
+
+                latency = self.benchmark_allreduce_single(
+                    sequence_length,
+                    allreduce_fn,
+                    should_use_fn,
+                    context,
+                    num_warmup,
+                    num_trials,
+                )
+                if latency is not None:
+                    results[name] = latency
+        finally:
+            # Leave the process environment as the caller had it
+            if saved_algo is None:
+                os.environ.pop(algo_env, None)
+            else:
+                os.environ[algo_env] = saved_algo
+
+        return results
+
+    def benchmark_allreduce_single(
+        self,
+        sequence_length: int,
+        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        should_use_fn: Callable[[torch.Tensor], bool],
+        context,
+        num_warmup: int,
+        num_trials: int,
+    ) -> Optional[float]:
+        """Benchmark method with CUDA graph optimization.
+
+        Captures ``CUDA_GRAPH_CAPTURE_CYCLES`` allreduce calls into one CUDA
+        graph and times ``num_trials`` replays of it.
+
+        Args:
+            sequence_length: First dimension of the test tensor.
+            allreduce_fn: Performs the allreduce on a tensor.
+            should_use_fn: Returns whether the communicator accepts the tensor.
+            context: Context manager entered around graph capture.
+            num_warmup: Graph replays performed before timing.
+            num_trials: Timed graph replays.
+
+        Returns:
+            Latency in milliseconds per allreduce, or ``None`` when
+            ``should_use_fn`` rejects the tensor.
+
+        Raises:
+            RuntimeError: If anything fails while capturing or replaying.
+        """
+        try:
+            # Create test tensor (2D: sequence_length x hidden_size)
+            tensor = torch.randn(
+                sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+            )
+            if not should_use_fn(tensor):
+                return None
+
+            torch.cuda.synchronize()
+            stream = torch.cuda.Stream()
+            with torch.cuda.stream(stream):
+                graph_input = tensor.clone()
+
+                # Warmup before capture
+                for _ in range(3):
+                    allreduce_fn(graph_input)
+
+                # Capture the graph using context manager
+                with context:
+                    graph = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(graph):
+                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+                            allreduce_fn(graph_input)
+
+            torch.cuda.synchronize()
+            for _ in range(num_warmup):
+                graph.replay()
+            # One synchronize is enough to drain the warmup before timing starts
+            torch.cuda.synchronize()
+
+            start_time = time.perf_counter()
+
+            for _ in range(num_trials):
+                graph.replay()
+            torch.cuda.synchronize()
+
+            end_time = time.perf_counter()
+
+            # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+            return (
+                (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+            )
+
+        except Exception as e:
+            logger.error("CUDA graph benchmark failed: %s", e)
+            raise RuntimeError(
+                f"CUDA graph benchmark failed for communicator: {e}"
+            ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+    """Summarise the fastest communicator for one tensor size.
+
+    Returns ``"N/A"`` for an empty mapping, otherwise the name of the entry
+    with the smallest time followed by its speedup over ``"pynccl"`` (or
+    ``"(N/A)"`` when there is no ``"pynccl"`` entry).
+    """
+    if not comm_results:
+        return "N/A"
+
+    # Lowest latency wins; ties keep the first entry in mapping order
+    winner, winner_time = min(comm_results.items(), key=lambda item: item[1])
+
+    if "pynccl" not in comm_results:
+        return f"{winner} (N/A)"
+
+    ratio = comm_results["pynccl"] / winner_time
+    return f"{winner} ({ratio:.2f}x)"
+
+
+def print_results(
+    results: dict[int, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+    """Print benchmark results in a formatted table.
+
+    Args:
+        results: Per-communicator latencies in milliseconds, keyed by sequence
+            length.
+        sequence_lengths: Row order of the table; lengths missing from
+            ``results`` are skipped.
+        world_size: Shown in the table banner.
+    """
+    rule = "=" * 130
+
+    print(f"\n{rule}")
+    print("Device Communicator Benchmark Results")
+    print(
+        f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+        f"Hidden Size: {HIDDEN_SIZE}"
+    )
+    print(rule)
+
+    # Union of communicator names over all sizes, sorted for a stable column order
+    all_comms = sorted({comm for timings in results.values() for comm in timings})
+
+    # Print header
+    header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+    header += "".join(f"{comm:<20}" for comm in all_comms)
+    header += f"{'Best (Speedup vs PyNccl)':<30}"
+    print(header)
+    print("-" * len(header))
+
+    # Print results for each sequence length
+    for seq_len in sequence_lengths:
+        if seq_len not in results:
+            continue
+        timings = results[seq_len]
+
+        # Tensor size in MB, from element count and element width
+        tensor_bytes = seq_len * HIDDEN_SIZE * BENCHMARK_DTYPE.itemsize
+        tensor_size_str = f"{tensor_bytes / (1024 * 1024):.2f} MB"
+        tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+        row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+        for comm in all_comms:
+            if comm in timings:
+                row += f"{timings[comm]:<20.3f}"
+            else:
+                row += f"{'N/A':<20}"
+
+        # Fastest communicator and its speedup over PyNccl
+        row += f"{_calculate_speedup_info(timings):<30}"
+
+        print(row)
+
+    print(rule)
+    print("All times are in milliseconds (ms) per allreduce operation")
+    print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+    """Parse arguments, run the allreduce benchmarks and report the results.
+
+    Every rank runs the benchmarks; only rank 0 prints the table and, when
+    ``--output-json`` is given, writes the JSON file.
+    """
+    parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+    parser.add_argument(
+        "--sequence-lengths",
+        type=int,
+        nargs="+",
+        default=DEFAULT_SEQUENCE_LENGTHS,
+        help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+    )
+
+    parser.add_argument(
+        "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+
+    parser.add_argument(
+        "--num-trials", type=int, default=50, help="Number of benchmark trials"
+    )
+
+    parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+    args = parser.parse_args()
+
+    # Initialize distributed
+    if not dist.is_initialized():
+        dist.init_process_group(backend="gloo")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    # Set device
+    # torchrun exports LOCAL_RANK (the index within the node). The global rank
+    # only equals it on a single node, so it is used just as the fallback.
+    local_rank = int(os.environ.get("LOCAL_RANK", rank))
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+
+    # Get CPU process group
+    cpu_group = dist.new_group(backend="gloo")
+
+    # Disable USE_SYMM_MEM to avoid affecting the max_sizes
+    # in symm_mem and custom_all_reduce for benchmark
+    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+    # Initialize benchmark
+    benchmark = CommunicatorBenchmark(
+        rank, world_size, device, cpu_group, args.sequence_lengths
+    )
+
+    # Run benchmarks
+    all_results = {}
+
+    for seq_len in args.sequence_lengths:
+        if rank == 0:
+            logger.info(
+                "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+                seq_len,
+                seq_len,
+                HIDDEN_SIZE,
+            )
+
+        results = benchmark.benchmark_allreduce(
+            sequence_length=seq_len,
+            num_warmup=args.num_warmup,
+            num_trials=args.num_trials,
+        )
+
+        all_results[seq_len] = results
+
+        # Synchronize between ranks
+        dist.barrier()
+
+    # Print results (only rank 0)
+    if rank == 0:
+        print_results(all_results, args.sequence_lengths, world_size)
+
+        # Save to JSON if requested
+        if args.output_json:
+            # Add speedup information to results
+            enhanced_results = {}
+            for seq_len, comm_results in all_results.items():
+                enhanced_results[seq_len] = {
+                    "timings": comm_results,
+                    "speedup_info": _calculate_speedup_info(comm_results),
+                }
+
+            output_data = {
+                "world_size": world_size,
+                "dtype": str(BENCHMARK_DTYPE),
+                "hidden_size": HIDDEN_SIZE,
+                "sequence_lengths": args.sequence_lengths,
+                "num_warmup": args.num_warmup,
+                "num_trials": args.num_trials,
+                "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+                "results": enhanced_results,
+            }
+
+            with open(args.output_json, "w") as f:
+                json.dump(output_data, f, indent=2)
+
+            logger.info("Results saved to %s", args.output_json)
+
+    # Cleanup
+    if cpu_group != dist.group.WORLD:
+        dist.destroy_process_group(cpu_group)
+
+
+# Script entry point; the module docstring shows the torchrun invocation.
+if __name__ == "__main__":
+    main()
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -464,7 +464,11 @@ class BenchmarkTensors:
        for field_name in LoRAKernelMeta.__dataclass_fields__:
            field = getattr(self.lora_kernel_meta, field_name)
            assert isinstance(field, torch.Tensor)
-            setattr(self.lora_kernel_meta, field_name, to_device(field))
+            setattr(
+                self.lora_kernel_meta,
+                field_name,
+                to_device(field) if field_name != "no_lora_flag_cpu" else field,
+            )

    def metadata(self) -> tuple[int, int, int]:
        """
@ -512,6 +516,7 @@ class BenchmarkTensors:
            "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
            "scaling": 1.0,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
        }

    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@ -552,6 +557,7 @@ class BenchmarkTensors:
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
            "offset_start": 0,
            "add_inputs": add_inputs,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
        }

    def bench_fn_kwargs(
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -560,7 +560,7 @@ def save_configs(
    filename = os.path.join(save_dir, filename)
    print(f"Writing best config to {filename}...")
    with open(filename, "w") as f:
-        json.dump(configs, f, indent=4)
+        json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
        f.write("\n")


@ -594,7 +594,11 @@ def main(args: argparse.Namespace):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+    elif config.architectures[0] in (
+        "Qwen2MoeForCausalLM",
+        "Qwen3MoeForCausalLM",
+        "Qwen3NextForCausalLM",
+    ):
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
--- a/benchmarks/kernels/benchmark_polynorm.py
+++ b/benchmarks/kernels/benchmark_polynorm.py
@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+):
+    """Reference PolyNorm written with plain PyTorch ops.
+
+    The input is flattened to 2D and cast to float32. The cube, square and
+    first power of the input are each divided by the square root of their
+    mean square over the last dimension (plus ``eps``), scaled by
+    ``weight[0]``, ``weight[1]`` and ``weight[2]`` respectively, summed, and
+    offset by ``bias``. The result is cast to ``weight.dtype`` and reshaped to
+    the input shape.
+    """
+    input_shape = x.shape
+    rows = x.view(-1, input_shape[-1]).float()
+
+    def _rms_normalize(t):
+        mean_square = t.pow(2).mean(-1, keepdim=True)
+        return t / torch.sqrt(mean_square + eps)
+
+    cubic_term = weight[0] * _rms_normalize(rows**3)
+    square_term = weight[1] * _rms_normalize(rows**2)
+    linear_term = weight[2] * _rms_normalize(rows)
+
+    combined = cubic_term + square_term + linear_term + bias
+    return combined.to(weight.dtype).view(input_shape)
+
+
+def polynorm_vllm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+):
+    """Run vLLM's ``poly_norm`` custom op and return the result in ``x``'s shape.
+
+    The input is flattened to 2D, the op writes into a freshly allocated
+    buffer of the same shape and dtype, and the buffer is viewed back to the
+    original shape.
+    """
+    input_shape = x.shape
+    flat = x.view(-1, input_shape[-1])
+
+    result = torch.empty_like(flat)
+    vllm_ops.poly_norm(result, flat, weight, bias, eps)
+
+    return result.view(input_shape)
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+    """Compare ``polynorm_naive`` and ``polynorm_vllm`` on one random input.
+
+    Uses a bfloat16 CUDA tensor of shape ``(batch_size, seq_len, hidden_dim)``
+    with all-ones weight and bias, and prints whether the two outputs agree
+    within ``atol=1e-2, rtol=1e-2``.
+    """
+    tensor_opts = {"dtype": torch.bfloat16, "device": "cuda"}
+    x = torch.randn(batch_size, seq_len, hidden_dim, **tensor_opts)
+    weight = torch.ones(3, **tensor_opts)
+    bias = torch.ones(1, **tensor_opts)
+
+    reference = polynorm_naive(x, weight, bias)
+    candidate = polynorm_vllm(x, weight, bias)
+
+    matches = torch.allclose(reference, candidate, atol=1e-2, rtol=1e-2)
+    print("✅ All implementations match" if matches else "❌ Implementations differ")
+
+
+# Benchmark sweep: batch sizes 1, 4, 16, 64; sequence lengths 64 through 1024
+# in powers of two; dims 2048 and 4096.
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+# Each config is a (dim, batch_size, seq_len) tuple
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+    """Build the Triton perf-report benchmark for the two PolyNorm paths.
+
+    Returns the decorated ``benchmark`` function, which sweeps ``configs`` and
+    times ``polynorm_naive`` against ``polynorm_vllm``.
+    """
+    sweep = triton.testing.Benchmark(
+        x_names=["dim", "batch_size", "seq_len"],
+        x_vals=[list(cfg) for cfg in configs],
+        line_arg="provider",
+        line_vals=["naive", "vllm"],
+        line_names=["Naive", "vLLM"],
+        styles=[("blue", "-"), ("red", "-")],
+        ylabel="us",
+        plot_name="polynorm-perf",
+        args={},
+    )
+
+    @triton.testing.perf_report(sweep)
+    def benchmark(dim, batch_size, seq_len, provider):
+        tensor_opts = {"dtype": torch.bfloat16, "device": "cuda"}
+        # The benchmarked last dimension is four times the swept dim
+        hidden_dim = dim * 4
+
+        x = torch.randn(batch_size, seq_len, hidden_dim, **tensor_opts)
+        weight = torch.ones(3, **tensor_opts)
+        bias = torch.ones(1, **tensor_opts)
+
+        # Anything other than "naive" selects the vLLM op
+        impl = polynorm_naive if provider == "naive" else polynorm_vllm
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: impl(x, weight, bias),
+            quantiles=[0.5, 0.2, 0.8],
+        )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+# Script entry point: one correctness check at the requested shape, then the
+# full performance sweep.
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=4,
+        help="Batch size",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=128,
+        help="Sequence length",
+    )
+    parser.add_argument(
+        "--hidden-dim",
+        type=int,
+        default=8192,
+        help="Intermediate size of MLP",
+    )
+    # NOTE: the default directory name ("polnorm") is kept as-is so existing
+    # result locations do not move; only the help text spelling is corrected.
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/polnorm/",
+        help="Path to save polynorm benchmark results",
+    )
+
+    args = parser.parse_args()
+
+    # Run correctness test
+    calculate_diff(
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        hidden_dim=args.hidden_dim,
+    )
+
+    benchmark = get_benchmark()
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,77 +1,675 @@
-#!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
+from collections.abc import Callable

+import matplotlib.pyplot as plt
+import numpy as np
 import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm,
+    silu_mul_fp8_quant_deep_gemm_cuda,
 )
 from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used


-def benchmark(E, T, H, G=128, runs=50):
-    current_platform.seed_everything(42)
-    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
-    tokens_per_expert = torch.randint(
-        T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+    # Pointers ------------------------------------------------------------
+    input_ptr,  # 16-bit activations (E, T, 2*H)
+    y_q_ptr,  # fp8 quantized activations (E, T, H)
+    y_s_ptr,  # float32 scales (E, T, G), as allocated by the launcher
+    counts_ptr,  # int32 num tokens per expert (E)
+    # Sizes ---------------------------------------------------------------
+    H: tl.constexpr,  # hidden dimension (per output)
+    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
+    # Strides for input (elements) ---------------------------------------
+    stride_i_e,
+    stride_i_t,
+    stride_i_h,
+    # Strides for y_q (elements) -----------------------------------------
+    stride_yq_e,
+    stride_yq_t,
+    stride_yq_h,
+    # Strides for y_s (elements) -----------------------------------------
+    stride_ys_e,
+    stride_ys_t,
+    stride_ys_g,
+    # Stride for counts (elements)
+    stride_counts_e,
+    # Numeric params ------------------------------------------------------
+    eps: tl.constexpr,
+    fp8_min: tl.constexpr,
+    fp8_max: tl.constexpr,
+    use_ue8m0: tl.constexpr,
+    # Meta ---------------------------------------------------------------
+    BLOCK: tl.constexpr,
+    NUM_STAGES: tl.constexpr,
+):
+    # Computes silu(gate) * up for one (expert, group) pair, then quantizes the
+    # group to fp8 with a single scale per token. Each program instance loops
+    # over the valid tokens of its expert.
+    G = H // GROUP_SIZE
+
+    # map program id -> (e, g)
+    pid = tl.program_id(0)
+    e = pid // G
+    g = pid % G
+
+    # 64-bit indices for the offset arithmetic below.
+    e = e.to(tl.int64)
+    g = g.to(tl.int64)
+
+    # number of valid tokens for this expert
+    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+    cols = tl.arange(0, BLOCK).to(tl.int64)
+    # cols ranges over [0, BLOCK), so this mask is all-true.
+    mask = cols < BLOCK
+
+    # Token-independent offsets. The "gate" half occupies columns [0, H) of the
+    # input's last dimension and the "up" half starts H columns later.
+    base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+    base_gate_offset = base_input_offset + cols * stride_i_h
+    base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+    base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+    # Only the first n_tokens rows of this expert are read and written.
+    for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+        gate = tl.load(
+            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+        ).to(tl.float32)
+        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+        # silu(gate) = gate * sigmoid(gate)
+        gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+        y = gate * up
+
+        # Per-group scale: absolute maximum (floored at eps) mapped to fp8_max.
+        y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+        if use_ue8m0:
+            # Round the scale up to the next power of two.
+            y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+        tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+    y: torch.Tensor,  # (E, T, 2*H)
+    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    num_parallel_tokens,
+    group_size: int = 128,
+    eps: float = 1e-10,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+    y has shape (E, T, 2*H). The first half of the last dimension is
+    silu-activated, multiplied by the second half, then quantized into FP8.
+
+    Only the first `tokens_per_expert[e]` token rows of expert `e` are
+    processed by the kernel. The outputs are allocated uninitialized, so rows
+    beyond that count hold arbitrary values.
+
+    `num_parallel_tokens` is not used by this implementation.
+    NOTE(review): presumably accepted so this function can be called with the
+    same keyword arguments as the CUDA kernel -- confirm.
+
+    Returns `(y_q, y_s)` where
+    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+
+    Raises `AssertionError` if `y` is not 3-D, its last dimension is odd, `H`
+    is not a multiple of `group_size`, or `tokens_per_expert` is not shape (E,).
+    """
+    assert y.ndim == 3, "y must be (E, T, 2*H)"
+    E, T, H2 = y.shape
+    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+    H = H2 // 2
+    # Ceiling division; equals H // group_size once the assert below holds.
+    G = (H + group_size - 1) // group_size
+    assert H % group_size == 0, "H must be divisible by group_size"
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+        "tokens_per_expert must be shape (E,)"
+    )
+    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+    # allocate outputs
+    fp8_dtype = torch.float8_e4m3fn
+    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+    # strides (elements)
+    stride_i_e, stride_i_t, stride_i_h = y.stride()
+    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+    # desired scale strides (elements): (T*G, 1, T)
+    stride_ys_e = T * G
+    stride_ys_t = 1
+    stride_ys_g = T
+    y_s = torch.empty_strided(
+        (E, T, G),
+        (stride_ys_e, stride_ys_t, stride_ys_g),
+        dtype=torch.float32,
+        device=y.device,
    )

+    stride_cnt_e = tokens_per_expert.stride()[0]
+
+    # Static grid over experts and H-groups.
+    # A loop inside the kernel handles the token dim
+    grid = (E * G,)
+
+    f_info = torch.finfo(fp8_dtype)
+    fp8_max = f_info.max
+    fp8_min = f_info.min
+
+    # Positional arguments follow the kernel's parameter order; BLOCK is set to
+    # the group size so one program covers exactly one quantization group.
+    _silu_mul_fp8_quant_deep_gemm[grid](
+        y,
+        y_q,
+        y_s,
+        tokens_per_expert,
+        H,
+        group_size,
+        stride_i_e,
+        stride_i_t,
+        stride_i_h,
+        stride_yq_e,
+        stride_yq_t,
+        stride_yq_h,
+        stride_ys_e,
+        stride_ys_t,
+        stride_ys_g,
+        stride_cnt_e,
+        eps,
+        fp8_min,
+        fp8_max,
+        is_deep_gemm_e8m0_used(),
+        BLOCK=group_size,
+        NUM_STAGES=4,
+        num_warps=1,
+    )
+
+    return y_q, y_s
+
+
+# Parse generation strategies
+strategies = ["uniform", "max_t", "first_t"]
+
+
+def benchmark(
+    kernel: Callable,
+    E: int,
+    T: int,
+    H: int,
+    total_tokens: int,
+    num_parallel_tokens: int = 64,
+    G: int = 128,
+    runs: int = 200,
+    num_warmups: int = 20,
+    gen_strategy: str = "default",
+    iterations_per_run: int = 20,
+):
+    def generate_data(seed_offset=0):
+        """Generate input data with given seed offset"""
+        current_platform.seed_everything(42 + seed_offset)
+        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+        if gen_strategy == "uniform":
+            r = torch.rand(size=(E,), device="cuda")
+            r /= r.sum()
+            r *= total_tokens
+            tokens_per_expert = r.int()
+            tokens_per_expert = torch.minimum(
+                tokens_per_expert,
+                torch.ones((E,), device=r.device, dtype=torch.int) * T,
+            )
+        elif gen_strategy == "max_t":
+            tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert.fill_(total_tokens / E)
+        elif gen_strategy == "first_t":
+            tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert[0] = min(T, total_tokens)
+        else:
+            raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+        return y, tokens_per_expert
+
+    dataset_count = 4
+    # Pre-generate different input matrices for each iteration to avoid cache effects
+    data_sets = [generate_data(i) for i in range(dataset_count)]
+
    # Warmup
-    for _ in range(10):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
-        torch.cuda.synchronize()
+    y, tokens_per_expert = data_sets[0]
+    for _ in range(num_warmups):
+        kernel(
+            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+        )
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)

    # Benchmark
-    torch.cuda.synchronize()
-    start = time.perf_counter()
+    latencies: list[float] = []
    for _ in range(runs):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
-    torch.cuda.synchronize()
+        torch.cuda.synchronize()

-    avg_time = (time.perf_counter() - start) / runs * 1000
+        start_event.record()
+        for i in range(iterations_per_run):
+            y, tokens_per_expert = data_sets[i % dataset_count]
+            kernel(
+                y,
+                tokens_per_expert,
+                num_parallel_tokens=num_parallel_tokens,
+                group_size=G,
+            )
+        end_event.record()
+        end_event.synchronize()

-    # Calculate actual work done (only count valid tokens)
+        total_time_ms = start_event.elapsed_time(end_event)
+        per_iter_time_ms = total_time_ms / iterations_per_run
+        latencies.append(per_iter_time_ms)
+
+    # Use median instead of average for better outlier handling
+    median_time_ms = np.median(latencies)
+    median_time_s = median_time_ms / 1000
+
+    # Calculate actual work done (using first dataset for consistency)
+    _, tokens_per_expert = data_sets[0]
    actual_tokens = tokens_per_expert.sum().item()
    actual_elements = actual_tokens * H

    # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
    ops_per_element = 8
    total_ops = actual_elements * ops_per_element
-    gflops = total_ops / (avg_time / 1000) / 1e9
+    gflops = total_ops / median_time_s / 1e9

    # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
    input_bytes = actual_tokens * 2 * H * 2  # 2*H bfloat16 inputs
    output_bytes = actual_tokens * H * 1  # H fp8 outputs
    scale_bytes = actual_tokens * (H // G) * 4  # scales in float32
    total_bytes = input_bytes + output_bytes + scale_bytes
-    memory_bw = total_bytes / (avg_time / 1000) / 1e9
+    memory_bw = total_bytes / median_time_s / 1e9

-    return avg_time, gflops, memory_bw
+    HOPPER_BANDWIDTH_TBPS = 3.35
+    return (
+        median_time_ms,
+        gflops,
+        memory_bw,
+        (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
+    )


+def create_comparison_plot(
+    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+):
+    """Create a comparison plot for a specific generation strategy"""
+    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+
+    # Configure x-axis positions
+    x = np.arange(len(config_labels))
+    width = 0.35
+
+    # Execution Time plot (lower is better)
+    ax.bar(
+        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
+    )
+    ax.bar(
+        x + width / 2,
+        baseline_times,
+        width,
+        label="Baseline",
+        alpha=0.8,
+        color="orange",
+    )
+
+    # Add speedup labels over each bar pair
+    for i in range(len(x)):
+        speedup = ratio[i]
+        max_height = max(cuda_times[i], baseline_times[i])
+        ax.text(
+            x[i],
+            max_height + max_height * 0.02,
+            f"{speedup:.2f}x",
+            ha="center",
+            va="bottom",
+            fontweight="bold",
+            fontsize=9,
+        )
+
+    ax.set_xlabel("Configuration")
+    ax.set_ylabel("% Utilization")
+    ax.set_title(
+        f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+    )
+    ax.set_xticks(x)
+    ax.set_xticklabels(config_labels, rotation=45, ha="right")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    return fig, ax
+
+
+def create_combined_plot(all_results):
+    """Create a combined plot with all strategies in one PNG
+
+    Draws one row of paired bars per entry of `all_results`, saves the figure
+    to a fixed relative path, shows it, and returns that path.
+
+    NOTE(review): each entry is unpacked into exactly five fields, but the
+    driver loop in this file appends six-field records (with a trailing
+    `config_x_axis`) whose result fields are nested per-config lists. Calling
+    this function with that `all_results` would fail at the unpacking step.
+    The function is not called anywhere in the visible script -- confirm
+    whether it should be adapted or removed.
+    """
+    num_strategies = len(all_results)
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+
+    # With a single row plt.subplots returns a bare Axes; wrap it for indexing.
+    if num_strategies == 1:
+        axes = [axes]
+
+    for idx, (
+        strategy_name,
+        ratio,
+        cuda_times,
+        baseline_times,
+        config_labels,
+    ) in enumerate(all_results):
+        ax = axes[idx]
+
+        # Configure x-axis positions
+        x = np.arange(len(config_labels))
+        width = 0.35
+
+        # Paired bars. NOTE(review): the variable names suggest execution
+        # times, while the axis label and title below say bandwidth
+        # utilization -- confirm which metric is intended.
+        ax.bar(
+            x - width / 2,
+            cuda_times,
+            width,
+            label="CUDA Kernel",
+            alpha=0.8,
+            color="blue",
+        )
+        ax.bar(
+            x + width / 2,
+            baseline_times,
+            width,
+            label="Baseline",
+            alpha=0.8,
+            color="orange",
+        )
+
+        # Add speedup labels over each bar pair
+        for i in range(len(x)):
+            speedup = ratio[i]
+            max_height = max(cuda_times[i], baseline_times[i])
+            ax.text(
+                x[i],
+                max_height + max_height * 0.02,
+                f"{speedup:.2f}x",
+                ha="center",
+                va="bottom",
+                fontweight="bold",
+                fontsize=9,
+            )
+
+        ax.set_xlabel("Configuration")
+        ax.set_ylabel("% Utilization")
+        ax.set_title(
+            f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+        )
+        ax.set_xticks(x)
+        ax.set_xticklabels(config_labels, rotation=45, ha="right")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    # The path is relative to the current working directory.
+    filename = "../../silu_bench/silu_benchmark_combined.png"
+    plt.savefig(filename, dpi=300, bbox_inches="tight")
+    plt.show()
+
+    return filename
+
+
+outer_dim = 7168
 configs = [
-    (8, 32, 1024),
-    (16, 64, 2048),
-    (32, 128, 4096),
    # DeepSeekV3 Configs
-    (256, 16, 7168),
-    (256, 32, 7168),
-    (256, 64, 7168),
-    (256, 128, 7168),
-    (256, 256, 7168),
-    (256, 512, 7168),
+    (8, 1024, 7168),
+    # DeepSeekV3 Configs
+    (32, 1024, 7168),
+    # DeepSeekV3 Configs
    (256, 1024, 7168),
 ]

-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
+runs = 100
+num_warmups = 20

-for E, T, H in configs:
-    try:
-        time_ms, gflops, gbps = benchmark(E, T, H)
-        print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
-    except Exception:
-        print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+strategy_descriptions = {
+    "uniform": "Uniform Random",
+    "max_t": "Even Assignment",
+    "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+    print(f"\n{'=' * 60}")
+    print(f"Testing strategy: {strategy_descriptions[strategy]}")
+    print(f"{'=' * 60}")
+
+    # Collect benchmark data for both algorithms
+    config_labels = []
+    config_x_axis = []
+    all_cuda_results = []
+    all_baseline_results = []
+    all_ratios = []
+
+    for E, T, H in configs:
+        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+        config_x_axis.append(total_tokens_config)
+
+        cuda_results = []
+        baseline_results = []
+        ratios = []
+
+        for total_tokens in total_tokens_config:
+            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+            config_labels.append(config_label)
+
+            # CUDA kernel results
+            time_ms_cuda, gflops, gbps, perc = benchmark(
+                silu_mul_fp8_quant_deep_gemm_cuda,
+                E,
+                T,
+                H,
+                total_tokens,
+                runs=runs,
+                num_warmups=num_warmups,
+                gen_strategy=strategy,
+            )
+            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+
+            # Baseline results
+            time_ms_triton, gflops, gbps, perc = benchmark(
+                silu_mul_fp8_quant_deep_gemm_triton,
+                E,
+                T,
+                H,
+                total_tokens,
+                runs=runs,
+                num_warmups=num_warmups,
+                gen_strategy=strategy,
+            )
+            baseline_results.append((time_ms_triton, gflops, gbps, perc))
+            ratios.append(time_ms_triton / time_ms_cuda)
+
+            print(f"Completed: {config_label}")
+        all_cuda_results.append(cuda_results)
+        all_baseline_results.append(baseline_results)
+        all_ratios.append(ratios)
+
+    # Store results for combined plotting
+    all_results.append(
+        (
+            strategy_descriptions[strategy],
+            all_ratios,
+            all_cuda_results,
+            all_baseline_results,
+            config_labels,
+            config_x_axis,
+        )
+    )
+
+    # Print summary table for this strategy
+    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
+    print("-" * 60)
+
+    for i, (E, T, H) in enumerate(configs):
+        speedup = baseline_results[i][0] / cuda_results[i][0]
+        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+        print(
+            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
+            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+        )
+
+
+def create_total_tokens_plot(all_results):
+    num_strategies = len(all_results)
+    num_configs = len(configs)
+
+    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
+    fig, axs = plt.subplots(
+        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+    )
+
+    # Add main title to the entire figure
+    fig.suptitle(
+        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
+        fontsize=16,
+        fontweight="bold",
+        y=0.98,
+    )
+
+    # Handle single strategy case
+    if num_strategies == 1:
+        axs = axs.reshape(1, -1)
+
+    # Handle single config case
+    if num_configs == 1:
+        axs = axs.reshape(-1, 2)
+
+    for strategy_idx, result in enumerate(all_results):
+        (
+            strategy_name,
+            all_ratios,
+            all_cuda_results,
+            all_baseline_results,
+            config_labels,
+            config_x_axis,
+        ) = result
+
+        for config_idx in range(num_configs):
+            # Speedup plot (left column)
+            ax_speedup = axs[strategy_idx, config_idx * 2]
+            # Bandwidth plot (right column)
+            ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+            E, T, H = configs[config_idx]
+            ratios = all_ratios[config_idx]
+            total_tokens_values = config_x_axis[config_idx]
+
+            # Extract CUDA and Triton bandwidth percentages
+            cuda_bandwidth_percentages = [
+                result[3] for result in all_cuda_results[config_idx]
+            ]
+            triton_bandwidth_percentages = [
+                result[3] for result in all_baseline_results[config_idx]
+            ]
+
+            # Plot speedup ratios vs total tokens (left plot)
+            ax_speedup.plot(
+                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+            )
+            ax_speedup.set_title(
+                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+                fontsize=12,
+                fontweight="bold",
+            )
+            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.grid(True, alpha=0.3)
+
+            ax_bandwidth.plot(
+                total_tokens_values,
+                cuda_bandwidth_percentages,
+                "ro-",
+                linewidth=3,
+                markersize=8,
+                label="CUDA",
+            )
+            ax_bandwidth.plot(
+                total_tokens_values,
+                triton_bandwidth_percentages,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton",
+            )
+            ax_bandwidth.set_title(
+                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+                fontsize=12,
+                fontweight="bold",
+            )
+            ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+            ax_bandwidth.set_ylabel(
+                "% of Peak Bandwidth", fontweight="bold", fontsize=11
+            )
+            ax_bandwidth.legend(prop={"weight": "bold"})
+            ax_bandwidth.grid(True, alpha=0.3)
+
+            # Format x-axis labels for both plots
+            for ax in [ax_speedup, ax_bandwidth]:
+                ax.set_xticks(total_tokens_values)
+                ax.set_xticklabels(
+                    [
+                        f"{tt // 1000}K" if tt >= 1000 else str(tt)
+                        for tt in total_tokens_values
+                    ],
+                    fontweight="bold",
+                )
+                # Make tick labels bold
+                for label in ax.get_xticklabels() + ax.get_yticklabels():
+                    label.set_fontweight("bold")
+
+            # Add value labels on speedup points
+            for x, y in zip(total_tokens_values, ratios):
+                ax_speedup.annotate(
+                    f"{y:.2f}x",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, 12),
+                    ha="center",
+                    fontsize=10,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
+                )
+
+            # Add value labels on CUDA bandwidth points
+            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
+                ax_bandwidth.annotate(
+                    f"{y:.1f}%",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, 12),
+                    ha="center",
+                    fontsize=9,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
+                )
+
+            # Add value labels on Triton bandwidth points
+            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
+                ax_bandwidth.annotate(
+                    f"{y:.1f}%",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, -15),
+                    ha="center",
+                    fontsize=9,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+                )
+
+    plt.tight_layout()
+    plt.subplots_adjust(top=0.93)  # Make room for main title
+    filename = "silu_benchmark_total_tokens.png"
+    plt.savefig(filename, dpi=300, bbox_inches="tight")
+    plt.show()
+
+    return filename
+
+
+# Create combined plot with all strategies
+combined_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 60}")
+print("Benchmark Complete!")
+print(f"Generated combined plot: {combined_plot_filename}")
+print(f"{'=' * 60}")
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -56,7 +56,7 @@ def w8a8_block_matmul(
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization.
                    It should be 2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul.
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@ -55,6 +55,107 @@ output_num_chunks  166.0    99.01   11.80    79.00    90.00    98.00   108.75
 ----------------------------------------------------------------------------------------------------
 ```

+### JSON configuration file for synthetic conversations generation
+
+The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
+When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.
+
+The file `generate_multi_turn.json` is an example file.
+
+The file must contain the sections `prompt_input` and `prompt_output`.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
+The final value will always be rounded to an even number so each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
+### Random distributions for synthetic conversations generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.<br/>
+Use it to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+    "distribution": "constant",
+    "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+    "distribution": "uniform",
+    "min": 12,
+    "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive), should be equal to or larger than `min`.
+
+#### lognormal
+
+```json
+{
+    "distribution": "lognormal",
+    "average": 1000,
+    "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways:
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
+#### zipf
+
+```json
+{
+    "distribution": "zipf",
+    "alpha": 1.2,
+    "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+    "distribution": "poisson",
+    "alpha": 10,
+    "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
+
 ## ShareGPT Conversations

 To run with the ShareGPT data, download the following ShareGPT dataset:
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):

 class LognormalDistribution(Distribution):
    def __init__(
-        self, mean: float, sigma: float, max_val: Optional[int] = None
+        self,
+        mean: Optional[float] = None,
+        sigma: Optional[float] = None,
+        average: Optional[int] = None,
+        median_ratio: Optional[float] = None,
+        max_val: Optional[int] = None,
    ) -> None:
+        self.average = average
+        self.median_ratio = median_ratio
+        self.max_val = max_val
+
+        if average is not None:
+            if average < 1:
+                raise ValueError("Lognormal average must be positive")
+
+            if mean or sigma:
+                raise ValueError(
+                    "When using lognormal average, you can't provide mean/sigma"
+                )
+
+            if self.median_ratio is None:
+                # Default value that provides relatively wide range of values
+                self.median_ratio = 0.85
+
+            # Calculate mean/sigma of np.random.lognormal based on the average
+            mean, sigma = self._generate_lognormal_by_median(
+                target_average=self.average, median_ratio=self.median_ratio
+            )
+        else:
+            if mean is None or sigma is None:
+                raise ValueError(
+                    "Must provide both mean and sigma if average is not used"
+                )
+
+            if mean <= 0 or sigma < 0:
+                raise ValueError(
+                    "Lognormal mean must be positive and sigma must be non-negative"
+                )
+
+        # Mean and standard deviation of the underlying normal distribution
+        # Based on numpy.random.lognormal
        self.mean = mean
        self.sigma = sigma
-        self.max_val = max_val
+
+    @staticmethod
+    def _generate_lognormal_by_median(
+        target_average: int, median_ratio: float
+    ) -> tuple[float, float]:
+        """
+        Compute (mu, sigma) for a lognormal distribution given:
+        - a target average (mean of the distribution)
+        - a ratio of median / mean (controls skewness), assume mean > median
+
+        Background:
+        If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
+        * mean(X)   = exp(mu + sigma^2 / 2)
+        * median(X) = exp(mu)
+
+        So:
+        median / mean = exp(mu) / exp(mu + sigma^2 / 2)
+                      = exp(-sigma^2 / 2)
+
+        Rearranging:
+        sigma^2 = 2 * ln(mean / median)
+        mu      = ln(median)
+
+        This gives a unique (mu, sigma) for any valid mean and median.
+        """
+        # Check input validity: median must be smaller than mean
+        if median_ratio <= 0 or median_ratio >= 1:
+            raise ValueError("median_ratio must be in range (0, 1)")
+
+        target_median = target_average * median_ratio
+
+        # Solve sigma^2 = 2 * ln(mean / median)
+        sigma = np.sqrt(2 * np.log(target_average / target_median))
+        mu = np.log(target_median)
+
+        return mu, sigma

    def sample(self, size: int = 1) -> np.ndarray:
        samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
+
+        if self.average is not None:
+            # Scale to average
+            samples *= self.average / samples.mean()
+
        if self.max_val:
            samples = np.minimum(samples, self.max_val)

        return np.round(samples).astype(int)

    def __repr__(self) -> str:
-        return f"LognormalDistribution[{self.mean}, {self.sigma}]"
+        if self.average:
+            return (
+                f"LognormalDistribution[{self.average}, "
+                f"{self.median_ratio}, {self.max_val}]"
+            )
+        return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"


 class GenConvArgs(NamedTuple):
@ -173,10 +257,21 @@ def get_random_distribution(
        return PoissonDistribution(conf["alpha"], max_val=max_val)

    elif distribution == "lognormal":
+        max_val = conf.get("max", None)
+
+        if "average" in conf:
+            # Infer lognormal mean/sigma (numpy) from input average
+            median_ratio = conf.get("median_ratio", None)
+            return LognormalDistribution(
+                average=conf["average"], median_ratio=median_ratio, max_val=max_val
+            )
+
+        # Use mean/sigma directly (for full control over the distribution)
        verify_field_exists(conf, "mean", section, subsection)
        verify_field_exists(conf, "sigma", section, subsection)
-        max_val = conf.get("max", None)
-        return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+        return LognormalDistribution(
+            mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+        )

    elif distribution == "uniform":
        verify_field_exists(conf, "min", section, subsection)
--- a/benchmarks/multi_turn/generate_multi_turn.json
+++ b/benchmarks/multi_turn/generate_multi_turn.json
@ -15,9 +15,8 @@
        },
        "prefix_num_tokens": {
            "distribution": "lognormal",
-            "mean": 6,
-            "sigma": 4,
-            "max": 1500
+            "average": 1000,
+            "max": 5000
        },
        "num_tokens": {
            "distribution": "uniform",
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
  endif()

-  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)

  target_compile_options(${GPU_MOD_NAME} PRIVATE
    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
--- a/csrc/attention/mla/cutlass_mla_entry.cu
+++ b/csrc/attention/mla/cutlass_mla_entry.cu
@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
-                               torch::Tensor const& q_nope,
-                               torch::Tensor const& q_pe,
-                               torch::Tensor const& kv_c_and_k_pe_cache,
-                               torch::Tensor const& seq_lens,
-                               torch::Tensor const& page_table, double scale);
-#endif
-
-void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
-                        torch::Tensor const& q_pe,
-                        torch::Tensor const& kv_c_and_k_pe_cache,
-                        torch::Tensor const& seq_lens,
-                        torch::Tensor const& page_table, double scale) {
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-  return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                   seq_lens, page_table, scale);
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
-}
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.h"
-
-#include "cutlass_extensions/common.hpp"
-
-#include "device/sm100_mla.hpp"
-#include "kernel/sm100_mla_tile_scheduler.hpp"
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template <typename T, bool PersistenceOption = true>
-struct MlaSm100 {
-  using Element = T;
-  using ElementAcc = float;
-  using ElementOut = T;
-
-  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
-  using TileShapeH = cute::tuple_element_t<0, TileShape>;
-  using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
-  // H K (D_latent D_rope) B
-  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
-
-  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
-  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
-  using StrideO = StrideK;                            // H D B
-  using StrideLSE = cute::tuple<_1, int>;             // H B
-
-  using TileScheduler =
-      std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
-                         Sm100MlaIndividualTileScheduler>;
-
-  using FmhaKernel =
-      cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
-          TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
-          /*kIsCpAsync=*/true>;
-  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
-};
-
-template <typename T>
-typename T::Fmha::Arguments args_from_options(
-    at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
-    at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
-    at::Tensor const& page_table, double scale) {
-  cutlass::KernelHardwareInfo hw_info;
-  hw_info.device_id = q_nope.device().index();
-  hw_info.sm_count =
-      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-          hw_info.device_id);
-
-  int batches = q_nope.sizes()[0];
-  int page_count_per_seq = page_table.sizes()[1];
-  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
-  int page_size = kv_c_and_k_pe_cache.sizes()[1];
-  int max_seq_len = page_size * page_count_per_seq;
-  using TileShapeH = typename T::TileShapeH;
-  using TileShapeD = typename T::TileShapeD;
-  auto problem_shape =
-      cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
-  auto [H, K, D, B] = problem_shape;
-  auto [D_latent, D_rope] = D;
-
-  using StrideQ = typename T::StrideQ;
-  using StrideK = typename T::StrideK;
-  using StrideO = typename T::StrideO;
-  using StrideLSE = typename T::StrideLSE;
-
-  StrideQ stride_Q_latent = cute::make_tuple(
-      static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
-  StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
-                                           static_cast<int64_t>(H * D_rope));
-  StrideK stride_C =
-      cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
-                       static_cast<int64_t>(page_size * (D_latent + D_rope)));
-  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
-  StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
-  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
-                                      static_cast<int64_t>(H * D_latent));
-
-  using Element = typename T::Element;
-  using ElementOut = typename T::ElementOut;
-  using ElementAcc = typename T::ElementAcc;
-  auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
-  auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
-  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
-  auto scale_f = static_cast<float>(scale);
-  typename T::Fmha::Arguments arguments{
-      problem_shape,
-      {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
-       stride_C, C_ptr + D_latent, stride_C,
-       static_cast<int*>(seq_lens.data_ptr()),
-       static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
-       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
-       static_cast<ElementAcc*>(nullptr), stride_LSE},
-      hw_info,
-      1,        // split_kv
-      nullptr,  // is_var_split_kv
-  };
-  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
-  // split_kv automatically based on batch size and sequence length to balance
-  // workload across available SMs. Consider using var_split_kv for manual
-  // control if needed.
-  T::Fmha::set_split_kv(arguments);
-  return arguments;
-}
-
-template <typename Element>
-void runMla(at::Tensor const& out, at::Tensor const& q_nope,
-            at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
-            at::Tensor const& seq_lens, at::Tensor const& page_table,
-            float scale, cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element>;
-  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(
-      out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
-  size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  CUTLASS_CHECK(fmha.can_implement(arguments));
-
-  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
-  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
-                               torch::Tensor const& q_nope,
-                               torch::Tensor const& q_pe,
-                               torch::Tensor const& kv_c_and_k_pe_cache,
-                               torch::Tensor const& seq_lens,
-                               torch::Tensor const& page_table, double scale) {
-  TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
-  TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
-  TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
-  TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
-              "kv_c_and_k_pe_cache must be a 3D tensor");
-  TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
-  TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
-  TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
-
-  auto B_q_nope = q_nope.size(0);
-  auto H_q_nope = q_nope.size(1);
-  auto D_q_nope = q_nope.size(2);
-  auto B_q_pe = q_pe.size(0);
-  auto H_q_pe = q_pe.size(1);
-  auto D_q_pe = q_pe.size(2);
-  auto B_pt = page_table.size(0);
-  auto PAGE_NUM = page_table.size(1);
-  auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
-  auto D_ckv = kv_c_and_k_pe_cache.size(2);
-  auto B_o = out.size(0);
-  auto H_o = out.size(1);
-  auto D_o = out.size(2);
-
-  TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
-  TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
-  TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
-  TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
-              "H_q_nope, H_q_pe, and H_o must be equal to 128");
-  TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
-              "PAGE_SIZE must be a power of 2");
-  TORCH_CHECK(
-      B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
-      "Batch dims must be same for page_table, q_nope and q_pe, and out");
-  TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
-              "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
-  TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
-
-  TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
-                  q_nope.dtype() == at::ScalarType::BFloat16 ||
-                  q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
-              "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
-  TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
-                  q_nope.dtype() == q_pe.dtype(),
-              "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
-  TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
-              "seq_lens must be a 32-bit integer tensor");
-  TORCH_CHECK(page_table.dtype() == torch::kInt32,
-              "page_table must be a 32-bit integer tensor");
-
-  auto in_dtype = q_nope.dtype();
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
-  const cudaStream_t stream =
-      at::cuda::getCurrentCUDAStream(q_nope.get_device());
-  if (in_dtype == at::ScalarType::Half) {
-    runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
-                            page_table, scale, stream);
-  } else if (in_dtype == at::ScalarType::BFloat16) {
-    runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                seq_lens, page_table, scale, stream);
-  } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-    runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                  seq_lens, page_table, scale, stream);
-  } else {
-    TORCH_CHECK(false, "Unsupported input data type of MLA");
-  }
-}
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode(
    torch::Tensor const& seq_lens,
    torch::Tensor const& page_table,
    torch::Tensor const& workspace,
+    double sm_scale,
    int64_t num_kv_splits) {
  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
 }
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@ -12,7 +12,7 @@ namespace vec_op {
 #define vec_sub(a, b) ((a) - (b))
 #define vec_mul(a, b) ((a) * (b))
 #define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebraic
 #define vec_sl(a, b) ((a) << (b))  // Vector Shift Left

 // FIXME: FP16 is not fully supported in Torch-CPU
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
  CPU_KERNEL_GUARD_IN(onednn_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.stride(-1) == 1);
-  TORCH_CHECK(c.is_contiguous());
+  TORCH_CHECK(c.stride(-1) == 1);
  MatMulPrimitiveHandler* ptr =
      reinterpret_cast<MatMulPrimitiveHandler*>(handler);

--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@ -215,7 +215,7 @@ int moe_align_block_size(
      offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
    }
  });
-  // TODO: do we need to vecterize this ?
+  // TODO: do we need to vectorize this ?
  for (int mb = 0; mb < num_token_blocks; ++mb) {
    offsets[mb + 1] += offsets[mb];
  }
--- a/csrc/cub_helpers.h
+++ b/csrc/cub_helpers.h
@ -0,0 +1,17 @@
+#pragma once
+
+// Defines CubAddOp / CubMaxOp, the "add" and "max" functor types to use with
+// CUB-style reductions, so callers do not have to branch on the toolchain:
+//   * CUDA with CUB_VERSION >= 200800: cuda::std::plus<> / cuda::maximum<>
+//   * CUDA with older CUB:             cub::Sum / cub::Max
+//   * ROCm (USE_ROCM defined):         cub::Sum / cub::Max via hipcub
+// NOTE(review): presumably the 200800 cutoff is where cub::Sum / cub::Max
+// stop being the preferred functors - confirm against the CUB changelog.
+
+#ifndef USE_ROCM
+  #include <cub/cub.cuh>
+  #if CUB_VERSION >= 200800
+    #include <cuda/std/functional>
+using CubAddOp = cuda::std::plus<>;
+using CubMaxOp = cuda::maximum<>;
+  #else   // if CUB_VERSION < 200800
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+  #endif  // CUB_VERSION
+#else
+  #include <hipcub/hipcub.hpp>
+// NOTE(review): this branch names `cub::` after including hipcub; it
+// presumably relies on hipify or a `cub` namespace alias defined elsewhere -
+// confirm.
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+#endif  // USE_ROCM
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16;
 #include <map>
 #include <unordered_map>
 #include <vector>
+#include <cstdlib>
+#include <cstring>

 namespace vllm {
 #define CUDACHECK(cmd)                                              \
@ -555,22 +557,47 @@ class CustomAllreduce {
    size /= d;
    auto bytes = size * sizeof(typename packed_t<T>::P);
    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+
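+    // Read the optional VLLM_CUSTOM_ALLREDUCE_ALGO override up front so each
+    // REDUCE_CASE branch only tests the two flags below. Note: std::getenv
+    // runs every time this code executes; the result is not cached.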
+    const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO");
+    bool force_1stage = false;
+    bool force_2stage = false;
+    if (env_algo != nullptr) {
+      if (std::strcmp(env_algo, "1stage") == 0 ||
+          std::strcmp(env_algo, "oneshot") == 0) {
+        force_1stage = true;
+      } else if (std::strcmp(env_algo, "2stage") == 0 ||
+                 std::strcmp(env_algo, "twoshot") == 0) {
+        force_2stage = true;
+      } else {
+        throw std::runtime_error(
+            "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) +
+            ". Valid values: 1stage, oneshot, 2stage, twoshot");
+      }
+    }
+
 #define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
-#define REDUCE_CASE(ngpus)                            \
-  case ngpus: {                                       \
-    if (world_size_ == 2) {                           \
-      KL(ngpus, cross_device_reduce_1stage);          \
-    } else if (fully_connected_) {                    \
-      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
-          (world_size_ <= 8 && bytes < 256 * 1024)) { \
-        KL(ngpus, cross_device_reduce_1stage);        \
-      } else {                                        \
-        KL(ngpus, cross_device_reduce_2stage);        \
-      }                                               \
-    }                                                 \
-    break;                                            \
+#define REDUCE_CASE(ngpus)                              \
+  case ngpus: {                                         \
+    if (force_1stage) {                                 \
+      KL(ngpus, cross_device_reduce_1stage);            \
+    } else if (force_2stage) {                          \
+      KL(ngpus, cross_device_reduce_2stage);            \
+    } else {                                            \
+      if (world_size_ == 2) {                           \
+        KL(ngpus, cross_device_reduce_1stage);          \
+      } else if (fully_connected_) {                    \
+        if ((world_size_ <= 4 && bytes < 512 * 1024) || \
+            (world_size_ <= 8 && bytes < 256 * 1024)) { \
+          KL(ngpus, cross_device_reduce_1stage);        \
+        } else {                                        \
+          KL(ngpus, cross_device_reduce_2stage);        \
+        }                                               \
+      }                                                 \
+    }                                                   \
+    break;                                              \
  }

    switch (world_size_) {
--- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
@ -1,123 +0,0 @@
-// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
-// clang-format off
-#pragma once
-
-#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
-
-#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_SS (BlockScaled Builders)
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  int ScaleGranularityM
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
-    cute::enable_if_t<
-      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
-
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-
-  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
-                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
-                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
-
-  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
-                                                          KernelTmaWarpSpecializedCooperative,
-                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
-  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
-  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
-
-  using SmemCopyAtomA = void;
-  using SmemCopyAtomB = void;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
@ -1,183 +0,0 @@
-// clang-format off
-// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cute/algorithm/clear.hpp"
-#include "cute/tensor.hpp"
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////FP8 Accumulation///////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-/// This class provides API to promote (add) or scale (multiply_add) the results
-/// from the tensor core accumulators to the main accumulators when the number 
-/// of MMAs reaches the max number of MMA interval specified by user, after that
-/// the tensor core accumulators are zeroed.
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-template <
-    class EngineAccum,
-    class LayoutAccum>
-struct GmmaFP8AccumulationWithScale {  
-  using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
-  using ElementAccumulator = typename EngineAccum::value_type;
-
-  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
-  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
-
-private:
-  TensorAccum& accum_;
-  TensorAccum accum_temp_;
-
-  uint32_t accum_promotion_interval_;         // defines the max num of executed MMAs after which accum should be promoted.
-  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
-  uint32_t mma_count_;                        // current executed MMAs
-  uint32_t reset_accum_flag_;                 // accum needs to be zeroed or not. 
-
-  // promote or `add` the partial accumulators to main accumulator (FADD).
-  CUTLASS_DEVICE
-  void promote_core() {
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i);
-    }
-  }
-
-  // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA).
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_core(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    using TensorScale = cute::Tensor<EngineScale, LayoutScale>;
-
-    static_assert(is_static<LayoutScale>::value, "Scale Layout should be static");
-    static_assert(is_rmem<TensorScale>::value , "Scale tensor must be rmem resident.");
-
-    static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape.");
-
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i) * scale(i);
-    }
-  }
-
-public:
-  CUTLASS_DEVICE
-  GmmaFP8AccumulationWithScale(
-      TensorAccum &accum,
-      uint32_t accum_promotion_interval,
-      uint32_t mma_count_per_mainloop_iteration)
-      : accum_(accum), 
-        accum_promotion_interval_(accum_promotion_interval),
-        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
-        mma_count_(0), 
-        reset_accum_flag_(0) 
-  {
-    accum_temp_ = cute::make_fragment_like(accum);
-  }
-
-  //
-  // Methods (Common)
-  //
-
-  CUTLASS_DEVICE 
-  TensorAccum& operator()() {
-    return accum_temp_;
-  }
-
-  /// prepare the MMA accumulators when initialization or zeroing is required.
-  CUTLASS_DEVICE
-  bool prepare_if_needed() { 
-    return reset_accum_flag_;
-  }
-
-  //
-  // Methods (for FADD version)
-  //
-
-  /// promote (add) the results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_if_needed() {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      promote_core();
-      mma_count_ = 0;
-    }
-  }
-
-  /// promote (add) the residue results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_residue_if_needed() {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      promote_core();
-    }
-  }
-
-  //
-  // Methods (for FFMA version)
-  //
-
-  /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed.
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      scale_core(scale);
-      mma_count_ = 0;
-    }
-  }
-
-  /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed.
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_residue_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      scale_core(scale);
-    }
-  }
-};
-
-} // namespace cutlass::gemm::collective
--- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@ -1,729 +0,0 @@
-// clang-format off
-// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/numeric_types.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm80.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  int ScaleGranularityM_,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using ElementBlockScale = ElementAccumulator;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  // 33 producer thread events per CTA (1 for the operand tiles and 32 for the scales)
-  static constexpr int NumProducerThreadEvents = 33; 
-
-  static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
-  static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  
-  // Block scaling gmem-to-smem copy atom 
-  using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  
-  // Block scaling smem layout
-  using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
-  using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
-
-  // Compile-time requirements of this mainloop specialization.
-  // At least two stages are needed so that K_PIPE_MMAS (1) stays below K_PIPE_MAX (Stages).
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  // Both operands are consumed by GMMA directly from shared memory through descriptors.
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  // Operands must be loaded with TMA (optionally multicast); the messages name the offending operand.
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopyA - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopyB - invalid SM90 TMA copy atom specified.");
-  // Scales are multiplied straight into the accumulators, so the types must match.
-  static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
-             "ElementAccumulator and ElementBlockScale should be same datatype");
-
-  // Shared memory used by this collective: the operand and scale staging
-  // buffers plus the storage required by the mainloop pipeline.
-  struct SharedStorage
-  {
-    // Staging buffers; each array is sized by the cosize of its smem layout.
-    struct TensorStorage : cute::aligned_struct<128> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
-      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
-      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
-    } tensors;
-
-    // Storage owned by the mainloop pipeline.
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  // Convenience aliases for the nested storage types.
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  //   ptr_A / dA  - operand A and its strides; viewed as an (M,K,L) tensor
-  //   ptr_B / dB  - operand B and its strides; viewed as an (N,K,L) tensor
-  //   ptr_scale_A - block scaling factors for A
-  //   ptr_scale_B - block scaling factors for B
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-    ElementBlockScale const* ptr_scale_A; 
-    ElementBlockScale const* ptr_scale_B;
-  };
-
-  // Device side kernel params
-  // Holds the TMA copy objects for A and B, the per-stage TMA transaction byte
-  // counts, and the scale pointers forwarded from `Arguments`.
-  struct Params {
-    // TMA copy type for A, deduced from a placeholder (nullptr) tensor with the right strides.
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    // TMA copy type for B, deduced the same way.
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    // Byte counts default to the compile-time values for one pipeline stage.
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
-    // Block scaling factors for A and B
-    ElementBlockScale const* ptr_scale_A; 
-    ElementBlockScale const* ptr_scale_B;
-  };
-
-  //
-  // Methods
-  //
-
-  /// Converts host-side `Arguments` into device-side `Params`: builds the TMA
-  /// copy objects for A and B over the full problem, records the per-stage
-  /// transaction byte counts, and forwards the scale pointers.
-  /// `workspace` is not used by this collective.
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Pad a rank-3 (MNK) problem shape with a trailing 1 so it is always rank-4 (MNKL).
-    auto [M,N,K,L] = append<4>(problem_shape, 1);
-
-    // Global-memory views of the full operands.
-    Tensor gmem_a = make_tensor(reinterpret_cast<ElementA const*>(args.ptr_A),
-                                make_layout(make_shape(M,K,L), args.dA));
-    Tensor gmem_b = make_tensor(reinterpret_cast<ElementB const*>(args.ptr_B),
-                                make_layout(make_shape(N,K,L), args.dB));
-
-    // One TMA copy object per operand, each described against a single smem stage.
-    typename Params::TMA_A tma_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{}, gmem_a, SmemLayoutA{}(_,_,cute::Int<0>{}), TileShape{}, ClusterShape{});
-    typename Params::TMA_B tma_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{}, gmem_b, SmemLayoutB{}(_,_,cute::Int<0>{}), TileShape{}, ClusterShape{});
-
-    uint32_t const bytes_mk = TmaTransactionBytesMK;
-    uint32_t const bytes_nk = TmaTransactionBytesNK;
-
-    return {
-      tma_a,
-      tma_b,
-      bytes_mk + bytes_nk,
-      bytes_mk,
-      bytes_nk,
-      args.ptr_scale_A,
-      args.ptr_scale_B
-    };
-  }
-
-  /// Host-side check that the A and B operands satisfy the 128-bit alignment
-  /// required by TMA for the given problem shape. `args` is not inspected.
-  /// Returns true when both operands pass; otherwise traces a message and returns false.
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int kTmaAlignmentBits = 128;
-    // Minimum number of contiguous elements that make up one aligned TMA access.
-    constexpr int kMinAlignedElementsA = kTmaAlignmentBits / cutlass::sizeof_bits<ElementA>::value;
-    constexpr int kMinAlignedElementsB = kTmaAlignmentBits / cutlass::sizeof_bits<ElementB>::value;
-
-    // Pad a rank-3 (MNK) problem shape with a trailing 1 so it is always rank-4 (MNKL).
-    auto [M,N,K,L] = append<4>(problem_shape, 1);
-
-    bool const a_aligned =
-        cutlass::detail::check_alignment<kMinAlignedElementsA>(cute::make_shape(M,K,L), StrideA{});
-    bool const both_aligned = a_aligned &&
-        cutlass::detail::check_alignment<kMinAlignedElementsB>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!both_aligned) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return both_aligned;
-  }
-
-  // Number of shared-memory pipeline stages.
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  // Number of MMA batches allowed to stay in flight (used as the `warpgroup_wait` depth in `mma`).
-  static constexpr int K_PIPE_MMAS = 1;
-  // Bytes of operand A held by one pipeline stage of shared memory.
-  static constexpr uint32_t TmaTransactionBytesMK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
-  // Bytes of operand B held by one pipeline stage of shared memory.
-  static constexpr uint32_t TmaTransactionBytesNK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-  // Combined A + B bytes per stage.
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-
-  /// Prefetches the TMA descriptors of both operands (A first, then B).
-  /// Ideally issued from a single thread for best performance.
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params)
-  {
-    auto const& tma_a = mainloop_params.tma_load_a;
-    auto const& tma_b = mainloop_params.tma_load_b;
-    cute::prefetch_tma_descriptor(tma_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(tma_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The contract between the collective and the kernel layer is that
-  /// the returned tuple contains at least two elements, the first two being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  /// This collective appends two more elements, the scale tensors in global memory:
-  /// mScaleA_mkl - scales for A with shape (M / ScaleGranularityM, k_tiles, L)
-  /// mScaleB_nkl - scales for B with shape (n_tiles, k_tiles, L)
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    // Number of tiles along N and K, read off the tiled views.
-    auto tN = get<2>(gB_nkl.shape());
-    auto tK = get<3>(gA_mkl.shape());
-
-    // Make the tiled views of scale tensors
-    // NOTE(review): `M / ScaleGranularityM` is an integer division; presumably M is a multiple
-    // of ScaleGranularityM (or ScaleGranularityM is 1) -- confirm against the callers.
-    auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
-    auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
-    auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
-    auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
-
-    // The scale tensors are already stored per block in global memory, so no further tiling is
-    // applied here: the `m` (full) tensors are returned as-is and sliced later in `load`.
-    Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
-    Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  ///
-  /// For each of the `k_tile_count` k-tiles starting at `k_tile_iter`, acquires the next pipeline
-  /// stage, issues the TMA loads of the A and B tiles from one elected thread, copies the block
-  /// scales for that k-tile with cp.async, and commits the stage.
-  ///
-  /// mainloop_params       - device params (TMA copy objects)
-  /// pipeline              - mainloop pipeline used to acquire/commit stages
-  /// smem_pipe_write       - pipeline state of the first stage to write
-  /// load_inputs           - tuple produced by `load_init`: (gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl)
-  /// blk_coord             - (m,n,k,l) coordinate of the tile handled by this block
-  /// thread_idx            - not referenced in this body; the scale copies are sliced by `threadIdx.x`
-  /// block_rank_in_cluster - rank of this block inside its cluster, used for TMA slicing/multicast
-  /// shared_tensors        - shared-memory buffers for operands and scales
-  template <
-    class TensorA, class TensorB,
-    class TensorScaleA, class TensorScaleB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    // Only the elected thread issues the TMA loads below.
-    int lane_predicate = cute::elect_one_sync();
-
-    // Blockscaling: Tma loads for load_input and CpAsync for load_scale
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-    Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
-    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
-
-    //
-    // Prepare the TMA loads for A and B
-    //
-
-    // Decompose the block's rank in the cluster into (x, y) coordinates.
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    // A is sliced by the block's y coordinate, B by its x coordinate.
-    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-    // Partition the inputs based on the current block coordinates.
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-
-    // Block scaling: load_scale has scaling tensors in global memory which are not tiled
-    Tensor mScaleA_mkl = get<2>(load_inputs);
-    Tensor mScaleB_nkl = get<3>(load_inputs);
-    // Total number of A scale rows; used below to predicate out-of-range rows.
-    auto scales_m = get<0>(mScaleA_mkl.shape());
-
-    // Coordinate tensor matching mScaleA_mkl, used to recover each element's scale-row index.
-    Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
-
-    Tensor gScaleA = local_tile( 
-      mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
-      make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
-    Tensor cScaleA = local_tile( 
-      cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
-      make_coord(m_coord,_,l_coord));
-    Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
-
-    // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
-    // A scales are copied by 32 threads (one element each); B scales by a single thread.
-    TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, 
-      Layout<Shape<_32>>{}, Layout<Shape<_1>>{}); // (1,1,1)
-    TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, 
-      Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
-    ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
-    ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
-    
-    Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
-    Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
-    Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
-    
-    Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
-    Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
-
-    // Applies the mapping from block_tma_a
-    Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
-    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-    Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
-    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-    // Multicast masks stay 0 unless the corresponding operand uses a multicast TMA atom.
-    uint16_t mcast_mask_a = 0;
-    uint16_t mcast_mask_b = 0;
-
-    // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
-    // Maps the tile -> block, value
-    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      // Set one bit per block that shares this block's x coordinate.
-      for (int n = 0; n < size<1>(block_layout); ++n) {
-        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-      }
-    }
-
-    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      // Set one bit per block that shares this block's y coordinate.
-      for (int m = 0; m < size<0>(block_layout); ++m) {
-        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-      }
-    }
-
-    // Allocate predicate tensors for a_scales: not every scale row is guaranteed to be
-    // valid, because the last tile along M may be partial.
-    Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
-    #pragma unroll
-    for (int i = 0; i < size(tApA_ScaleA); ++i) {
-      // A scale element is valid only when its row index lies inside the scale tensor.
-      tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
-    }
-
-    // Mainloop
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-
-      //
-      // Copy gmem to smem for *k_tile_iter
-      //
-      int write_stage = smem_pipe_write.index();
-      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-      // Copy operands A and B from global memory to shared memory
-      if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-      if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-
-      // Copy scale tensors from global memory to shared memory
-      copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
-      copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
-      // Commit the stage, passing the cp.async barrier-arrive functor for the scale copies.
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
-
-      ++k_tile_iter;
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-  }
-
-  /// Producer epilogue: keeps the block alive until its outstanding pipeline
-  /// stages are drained, so blocks in a cluster do not exit early.
-  CUTLASS_DEVICE void
-  load_tail(
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write) {
-    // A single elected thread issues the epilogue waits.
-    if (cute::elect_one_sync()) {
-      // Waits for every stage to be released (all consumer UNLOCKs). A stage
-      // that was never used is simply acquired, because its phase is still
-      // inverted from make_producer_start_state.
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  ///
-  /// For each k-tile: waits for the stage to be filled, reads the block scales for that stage
-  /// into registers (A scales pre-multiplied by the B scale), issues the GMMAs into the
-  /// temporary accumulator owned by `GmmaFP8AccumulationWithScale`, and lets that helper scale
-  /// and fold the result into `accum`.
-  ///
-  /// pipeline        - mainloop pipeline used to wait on / release stages
-  /// smem_pipe_read  - pipeline state of the first stage to read
-  /// accum           - register-resident output accumulator fragment
-  /// k_tile_count    - number of k-tiles to process
-  /// thread_idx      - index of the calling thread, used to pick the MMA slice
-  /// shared_tensors  - shared-memory buffers for operands and scales
-  /// mainloop_params - not referenced in this body
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-
-
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-    
-    // Block scaling
-    // View the A scales with the shape of the C tile: the zero strides broadcast each scale
-    // across its ScaleGranularityM rows and across all N columns.
-    Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
-      Layout<
-        Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
-        Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
-      >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
-    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-    
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    // Warp-group index, taken from lane 0 so it is uniform across the warp.
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-    
-    // Per block scale values for operand A and B
-
-    using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
-    using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
-
-    Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
-    ElementBlockScale scale_b;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-
-    // The first GMMA overwrites the temporary accumulator instead of adding to it.
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA));
-    warpgroup_fence_operand(accumulation());
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      // Restart the temporary accumulator when the helper reports it must be reset.
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      int read_stage = smem_pipe_read.index();
-      
-      // Load per block scale values from shared memory to registers.
-      scale_b = sScaleB[read_stage];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
-      }
-      // Fold the B scale into the A scales so a single multiplier is applied per element.
-      if constexpr (ScaleMsPerTile == 1) {
-        static_assert(size(RegLayoutScaleAEssential{}) == 1);
-        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
-        }
-      }
-
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
-      accumulation.scale_if_needed(tCrScaleAViewAsC);
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accumulation());
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N)
-      scale_b = sScaleB[read_stage];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
-      }
-      // Fold the B scale into the A scales so a single multiplier is applied per element.
-      if constexpr (ScaleMsPerTile == 1) {
-        static_assert(size(RegLayoutScaleAEssential{}) == 1);
-        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
-        }
-      }
-
-      // Restart the temporary accumulator when the helper reports it must be reset.
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      warpgroup_fence_operand(accumulation());
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accumulation());
-
-      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
-      accumulation.scale_if_needed(tCrScaleAViewAsC);
-
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-    
-    // Fold in whatever MMAs were issued since the last scaling step.
-    accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
-
-    warpgroup_fence_operand(accumulation());
-  }
-
-  /// Consumer epilogue: waits for all outstanding GMMAs and releases the
-  /// pipeline stages that `mma` left locked.
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Number of stages consumed by the prologue GMMAs of `mma`; these were never released there.
-    int const pending_releases = min(K_PIPE_MMAS, k_tile_count);
-
-    // Skip the stages that the mainloop of `mma` has already released.
-    smem_pipe_release.advance(k_tile_count - pending_releases);
-
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int remaining = pending_releases; remaining > 0; --remaining) {
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@ -1,39 +0,0 @@
-#pragma once
-
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-namespace cutlass::gemm {
-
-//////////////////////////////////////////////////////////////////////////////
-
-// FP8 related policies (including Blocked Scaled Accumulation)
-//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
-//  `ScaleGranularityM` indicates that scaling granularity is
-//  `size<0>(TileShape_MNK{})` along M.
-template <int ScaleGranularityM = 0>
-struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum
-    : KernelTmaWarpSpecializedCooperative {};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
-// specialized dynamic schedule For FP8 kernels with Block Scaling
-template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
-          class KernelSchedule = KernelTmaWarpSpecialized,
-          int ScaleGranularityM =
-              0  // `ScaleGranularityM` specifies scaling granularity along M,
-                 // while zero-value `ScaleGranularityM` indicates that scaling
-                 // granularity is `size<0>(TileShape_MNK{})` along M.
-          >
-struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
-    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
-                                         KernelSchedule> {
-  static_assert(
-      cute::is_same_v<
-          KernelSchedule,
-          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
-              ScaleGranularityM>>,
-      "KernelSchedule must be one of the warp specialized policies");
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass::gemm
--- a/csrc/cutlass_extensions/vllm_collective_builder.cuh
+++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh
@ -1,6 +1,6 @@
 #pragma once

-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"

 namespace cutlass::gemm::collective {
 using namespace cute;
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -1,15 +1,10 @@
 #include "type_convert.cuh"
 #include "dispatch_utils.h"
+#include "cub_helpers.h"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif
-
 namespace vllm {

 // TODO(woosuk): Further optimize this kernel.
@ -30,7 +25,7 @@ __global__ void rms_norm_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -85,7 +80,7 @@ fused_add_rms_norm_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -126,7 +121,7 @@ fused_add_rms_norm_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -140,6 +135,211 @@ fused_add_rms_norm_kernel(
  }
 }

+/* Function specialization in the case of FP16/BF16 tensors.
+   Additional optimizations we can make in this case are
+   packed and vectorized operations, which help with the
+   memory latency bottleneck.
+
+   _f16VecPN struct extends _f16Vec to add operations specifically required for
+   polynomial normalization (poly norm).
+   The original _f16Vec does not include the sum-of-powers computation or
+   in-place polynomial normalization logic.
+
+   Members added here:
+     sum_pows()          - returns the tuple (sum x^2, sum x^4, sum x^6) over
+                           the `width` stored elements, accumulated in float.
+     poly_norm_inplace() - overwrites every stored element x with
+                           w2_inv_std * x + w1_inv_std2 * x^2 +
+                           w0_inv_std3 * x^3 + bias, evaluated in float.
+   Both loops consume elements in pairs (data[i], data[i + 1]), so `width`
+   is assumed to be even -- TODO confirm against _f16Vec. */
+template <typename scalar_t, int width>
+struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
+  using Base = _f16Vec<scalar_t, width>;
+  using Converter = typename Base::Converter;
+  using T1 = typename Base::T1;
+  using T2 = typename Base::T2;
+  using Base::data;  // element storage inherited from the base struct
+
+  __device__ auto sum_pows() const {
+    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;  // sums of x^2, x^4, x^6
+
+#pragma unroll
+    for (int i = 0; i < width; i += 2) {
+      float2 z = Converter::convert(T2{data[i], data[i + 1]});  // one pair as float2
+      float x2 = z.x * z.x;
+      float x4 = x2 * x2;
+      float x6 = x4 * x2;
+
+      float y2 = z.y * z.y;
+      float y4 = y2 * y2;
+      float y6 = y4 * y2;
+
+      s2 += x2 + y2;
+      s4 += x4 + y4;
+      s6 += x6 + y6;
+    }
+    return std::make_tuple(s2, s4, s6);
+  }
+
+  __device__ void poly_norm_inplace(const float w2_inv_std,
+                                    const float w1_inv_std2,
+                                    const float w0_inv_std3, const float bias) {
+#pragma unroll
+    for (int i = 0; i < width; i += 2) {
+      float2 z = Converter::convert(T2{data[i], data[i + 1]});  // one pair as float2
+
+      float x2 = z.x * z.x;
+      float x3 = x2 * z.x;
+      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
+
+      float y2 = z.y * z.y;
+      float y3 = y2 * z.y;
+      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
+
+      auto out = Converter::convert(z);  // convert the pair back for storage
+      data[i] = out.x;
+      data[i + 1] = out.y;
+    }
+  }
+};
+
+template <typename scalar_t, int width>  // vectorized variant (width > 0)
+__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
+poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
+                 const scalar_t* __restrict__ weight,  // [3]
+                 const scalar_t* __restrict__ bias,    // [1]
+                 const float epsilon, const int hidden_size) {
+  // Sanity checks on our vector struct and type-punned pointer arithmetic
+  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
+  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
+
+  /* Overview: each block processes the row selected by blockIdx.x, reading
+     and writing it as `width`-element vectors.
+       Pass 1: every thread accumulates sums of x^2, x^4 and x^6 over its
+               share of the row; the three sums are then block-reduced.
+       Thread 0 turns the reduced sums into three scale factors
+               w * rsqrt(sum / hidden_size + epsilon) and stores them, plus
+               the bias, in shared memory.
+       Pass 2: every thread re-reads its vectors, applies
+               poly_norm_inplace() and writes the result to `out`.
+
+     These and the argument pointers are all declared `restrict` as they are
+     not aliased in practice. Argument pointers should not be dereferenced
+     in this kernel as that would be undefined behavior */
+  auto* __restrict__ input_v =
+      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
+  const int vec_hidden_size = hidden_size / width;  // vectors per row
+  float variance = 0.0f;   // per-thread sum of x^2
+  float variance2 = 0.0f;  // per-thread sum of x^4
+  float variance3 = 0.0f;  // per-thread sum of x^6
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;  // vector index in the tensor
+    _f16VecPN<scalar_t, width> temp = input_v[id];
+    auto [x2, x4, x6] = temp.sum_pows();
+
+    variance += x2;
+    variance2 += x4;
+    variance3 += x6;
+  }
+
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {  // component-wise float3 addition used by the reduction
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
+
+  variance = block_variances.x;  // only consumed by thread 0 below
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;
+
+  __shared__ float s_w2_inv_std;
+  __shared__ float s_w1_inv_std2;
+  __shared__ float s_w0_inv_std3;
+  __shared__ float s_bias;
+
+  if (threadIdx.x == 0) {
+    float w0 = (float)weight[0];  // multiplies the x^3 term
+    float w1 = (float)weight[1];  // multiplies the x^2 term
+    float w2 = (float)weight[2];  // multiplies the x term
+    s_bias = (float)bias[0];
+
+    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
+    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
+    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
+  }
+  __syncthreads();  // make the shared scale factors visible to every thread
+
+  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16VecPN<scalar_t, width> temp = input_v[id];
+    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
+    out_v[id] = temp;
+  }
+}
+
+/* Generic poly_norm_kernel
+   The width field is not used here but necessary for other specializations.
+
+   Selected when width == 0 or when _typeConvert<scalar_t> does not exist.
+   Each block processes the row selected by blockIdx.x, one scalar at a time:
+     Pass 1: every thread accumulates sums of x^2, x^4 and x^6 (in float)
+             over its share of the row; the sums are then block-reduced.
+     Thread 0 turns the reduced sums into three scale factors
+             w * rsqrt(sum / hidden_size + epsilon) and stores them, plus the
+             bias, in shared memory.
+     Pass 2: every thread writes
+             x * s_w2_inv_std + x^2 * s_w1_inv_std2 + x^3 * s_w0_inv_std3 +
+             s_bias, cast back to scalar_t, into `out`.
+ */
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
+poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
+                 const scalar_t* __restrict__ weight,  // [3]
+                 const scalar_t* __restrict__ bias,    // [1]
+                 const float epsilon, const int hidden_size) {
+  float variance = 0.0f;   // per-thread sum of x^2
+  float variance2 = 0.0f;  // per-thread sum of x^4
+  float variance3 = 0.0f;  // per-thread sum of x^6
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    float x2 = x * x;
+    float x4 = x2 * x2;
+    float x6 = x4 * x2;
+
+    variance += x2;
+    variance2 += x4;
+    variance3 += x6;
+  }
+
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {  // component-wise float3 addition used by the reduction
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
+
+  variance = block_variances.x;  // only consumed by thread 0 below
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;
+
+  __shared__ float s_w2_inv_std;
+  __shared__ float s_w1_inv_std2;
+  __shared__ float s_w0_inv_std3;
+  __shared__ float s_bias;
+
+  if (threadIdx.x == 0) {
+    float w0 = (float)weight[0];  // multiplies the x^3 term
+    float w1 = (float)weight[1];  // multiplies the x^2 term
+    float w2 = (float)weight[2];  // multiplies the x term
+    s_bias = (float)bias[0];
+
+    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
+    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
+    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
+  }
+  __syncthreads();  // make the shared scale factors visible to every thread
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    float x2 = x * x;
+    float x3 = x2 * x;
+
+    out[blockIdx.x * hidden_size + idx] =
+        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
+                   s_bias);
+  }
+}
+
 }  // namespace vllm

 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
@ -219,3 +419,49 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
 }
+
+// Launches vllm::poly_norm_kernel<scalar_t, width> for the floating-point
+// dtype of `input`. The expansion site must have `grid`, `block`, `stream`,
+// `out`, `input`, `weight`, `bias`, `epsilon` and `hidden_size` in scope.
+#define LAUNCH_FUSED_POLY_NORM(width)                                         \
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
+    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
+        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
+        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
+        hidden_size);                                                         \
+  });
+
+// Host entry point for polynomial normalization.
+// Requires `out` and `input` to be contiguous and to start at different
+// addresses. Launches one thread block per row of `hidden_size` elements and
+// picks the width-8 vectorized kernel when both buffers are 16-byte aligned
+// and `hidden_size` is a multiple of 8; otherwise the scalar kernel is used.
+void poly_norm(torch::Tensor& out,     // [..., hidden_size]
+               torch::Tensor& input,   // [..., hidden_size]
+               torch::Tensor& weight,  // [3]
+               torch::Tensor& bias,    // [1]
+               double epsilon) {
+  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.data_ptr() != input.data_ptr());
+
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+
+  /* This kernel is memory-latency bound in many scenarios.
+     When num_tokens is large, a smaller block size allows
+     for increased block occupancy on CUs and better latency
+     hiding on global mem ops. */
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, max_block_size));
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  /* If the tensor types are FP16/BF16, try to use the optimized kernel
+     with packed + vectorized ops.
+     Max optimization is achieved with a width-8 vector of FP16/BF16s
+     since we can load at most 128 bits at once in a global memory op.
+     However, this requires each tensor's data to be aligned to 16
+     bytes. */
+  const auto is_16_byte_aligned = [](const void* ptr) {
+    return reinterpret_cast<std::uintptr_t>(ptr) % 16 == 0;
+  };
+  const bool can_vectorize = is_16_byte_aligned(input.data_ptr()) &&
+                             is_16_byte_aligned(out.data_ptr()) &&
+                             hidden_size % 8 == 0;
+  if (can_vectorize) {
+    LAUNCH_FUSED_POLY_NORM(8);
+  } else {
+    LAUNCH_FUSED_POLY_NORM(0);
+  }
+}
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@ -8,16 +8,11 @@
 #include "type_convert.cuh"
 #include "quantization/fp8/common.cuh"
 #include "dispatch_utils.h"
+#include "cub_helpers.h"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif
-
 namespace vllm {

 // TODO(woosuk): Further optimize this kernel.
@ -39,7 +34,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -100,7 +95,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -149,7 +144,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -20,17 +20,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
-
-#ifndef USE_ROCM
-    #include <cub/util_type.cuh>
-    #include <cub/cub.cuh>
-    #include <cuda/std/functional>
-    using AddOp = cuda::std::plus<float>;
-#else
-    #include <hipcub/util_type.hpp>
-    #include <hipcub/hipcub.hpp>
-    using AddOp = cub::Sum; 
-#endif
+#include "../cub_helpers.h"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -79,7 +69,7 @@ __launch_bounds__(TPB) __global__
        threadData = max(static_cast<float>(input[idx]), threadData);
    }

-    const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
+    const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
    if (threadIdx.x == 0)
    {
        float_max = maxElem;
@ -94,7 +84,7 @@ __launch_bounds__(TPB) __global__
        threadData += exp((static_cast<float>(input[idx]) - float_max));
    }

-    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
+    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());

    if (threadIdx.x == 0)
    {
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

+void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
+               torch::Tensor& bias, double epsilon);
+
 void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
                                 const torch::Tensor& output_mask,
@ -119,12 +122,6 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      std::optional<torch::Tensor> key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);

-void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                              std::optional<torch::Tensor> key,
-                              int64_t head_size, torch::Tensor& cos_sin_cache,
-                              bool is_neox, int64_t rot_dim,
-                              torch::Tensor& cos_sin_cache_offsets);
-
 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
@ -136,6 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
 #endif
+void silu_mul_fp8_quant_deep_gemm_cuda(
+    const at::Tensor& input,   // (E, T, 2*H)
+    const at::Tensor& counts,  // (E)
+    at::Tensor& y_q,           // (E, T, H) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);

 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

@ -344,6 +347,8 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
 int64_t open_mem_handle(torch::Tensor& mem_handle);
 void free_shared_buffer(int64_t buffer);

+torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace);
+
 #ifdef USE_ROCM
 fptr_t init_custom_qr(int64_t rank, int64_t world_size,
                      std::optional<int64_t> qr_max_size = std::nullopt);
@ -353,4 +358,4 @@ void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
 void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                   int64_t quant_level, bool cast_bf2half = false);
 int64_t qr_max_size();
-#endif
+#endif
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@ -99,35 +99,6 @@ __global__ void rotary_embedding_kernel(
      token_idx, query_stride, key_stride, head_stride);
 }

-template <typename scalar_t, bool IS_NEOX>
-__global__ void batched_rotary_embedding_kernel(
-    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
-                                            // [num_tokens]
-    scalar_t* __restrict__ query,           // [batch_size, seq_len, num_heads,
-                                   // head_size] or [num_tokens, num_heads,
-                                   // head_size]
-    scalar_t* __restrict__ key,  // nullptr or
-                                 // [batch_size, seq_len, num_kv_heads,
-                                 // head_size] or [num_tokens, num_kv_heads,
-                                 // head_size]
-    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
-                                                 // 2]
-    const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
-    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int64_t head_stride, const int num_heads, const int num_kv_heads,
-    const int head_size) {
-  // Each thread block is responsible for one token.
-  const int token_idx = blockIdx.x;
-  int64_t pos = positions[token_idx];
-  int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
-  const scalar_t* cache_ptr =
-      cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;
-
-  apply_rotary_embedding<scalar_t, IS_NEOX>(
-      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride, head_stride);
-}
-
 }  // namespace vllm

 void rotary_embedding(
@ -211,96 +182,3 @@ void rotary_embedding(
    }
  });
 }
-
-/*
-Batched version of rotary embedding, pack multiple LoRAs together
-and process in batched manner.
-*/
-void batched_rotary_embedding(
-    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
-    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
-                           // [num_tokens, num_heads * head_size] or
-                           // [batch_size, seq_len, num_heads, head_size] or
-                           // [num_tokens, num_heads, head_size]
-    std::optional<torch::Tensor>
-        key,  // null or
-              // [batch_size, seq_len, num_kv_heads * head_size] or
-              // [num_tokens, num_kv_heads * head_size] or
-              // [batch_size, seq_len, num_heads, head_size] or
-              // [num_tokens, num_heads, head_size]
-    int64_t head_size,
-    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int64_t rot_dim,
-    torch::Tensor& cos_sin_cache_offsets  // [num_tokens] or [batch_size]
-) {
-  // num_tokens = batch_size * seq_len
-  int64_t num_tokens = cos_sin_cache_offsets.size(0);
-  TORCH_CHECK(
-      positions.size(0) == num_tokens || positions.numel() == num_tokens,
-      "positions must have the same num_tokens or batch_size as "
-      "cos_sin_cache_offsets");
-
-  int positions_ndim = positions.dim();
-  // Make sure num_tokens dim is consistent across positions, query, and key
-  TORCH_CHECK(
-      positions_ndim == 1 || positions_ndim == 2,
-      "positions must have shape [num_tokens] or [batch_size, seq_len]");
-  if (positions_ndim == 1) {
-    TORCH_CHECK(query.size(0) == positions.size(0) &&
-                    (!key.has_value() || key->size(0) == positions.size(0)),
-                "query, key and positions must have the same number of tokens");
-  }
-  if (positions_ndim == 2) {
-    TORCH_CHECK(
-        query.size(0) == positions.size(0) &&
-            (!key.has_value() || key->size(0) == positions.size(0)) &&
-            query.size(1) == positions.size(1) &&
-            (!key.has_value() || key->size(1) == positions.size(1)),
-        "query, key and positions must have the same batch_size and seq_len");
-  }
-
-  // Make sure head_size is valid for query and key
-  int query_hidden_size = query.numel() / num_tokens;
-  int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0;
-  TORCH_CHECK(query_hidden_size % head_size == 0);
-  TORCH_CHECK(key_hidden_size % head_size == 0);
-
-  // Make sure query and key have concistent number of heads
-  int num_heads = query_hidden_size / head_size;
-  int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads;
-  TORCH_CHECK(num_heads % num_kv_heads == 0);
-
-  int seq_dim_idx = positions_ndim - 1;
-  int64_t query_stride = query.stride(seq_dim_idx);
-  int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
-  // Determine head stride: for [*, heads, head_size] use stride of last dim;
-  // for flat [*, heads*head_size], heads blocks are contiguous of size
-  // head_size
-  int query_ndim = query.dim();
-  int64_t head_stride =
-      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
-    if (is_neox) {
-      vllm::batched_rotary_embedding_kernel<scalar_t, true>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(),
-              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-              key_stride, head_stride, num_heads, num_kv_heads, head_size);
-    } else {
-      vllm::batched_rotary_embedding_kernel<scalar_t, false>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(),
-              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-              key_stride, head_stride, num_heads, num_kv_heads, head_size);
-    }
-  });
-}
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -9,6 +9,26 @@

 #include "quantization/fp8/common.cuh"

+#include <c10/util/Float8_e4m3fn.h>
+
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+  #include <cuda_fp8.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+  #include <hip/hip_fp8.h>
+
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat16_raw __nv_bfloat16_raw;
+
+typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
+typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
+#endif
+
+#include "core/registration.h"
 namespace vllm {

 template <typename T>
@ -87,6 +107,337 @@ __global__ void act_and_mul_quant_kernel(
    }
  }
 }
+
+// SiLU activation, evaluated as x / (1 + exp(-x)) with the __fdividef
+// division intrinsic.
+__device__ __forceinline__ float silu(float x) {
+  const float denom = 1.f + expf(-x);
+  return __fdividef(x, denom);
+}
+
+// Applies silu() independently to the x and y components of a float2.
+__device__ __forceinline__ float2 silu2(float2 x) {
+  const float lo = silu(x.x);
+  const float hi = silu(x.y);
+  return make_float2(lo, hi);
+}
+
+#ifndef USE_ROCM
+// Max-reduction across a warp using XOR shuffles with strides 1, 2, 4, ...
+// below WARP_SIZE. The shuffle mask names all 32 lanes, so every lane of the
+// warp is expected to call this together.
+__device__ __forceinline__ float warp_max(float v) {
+  static constexpr unsigned ALL_LANES = 0xffffffffu;
+  float best = v;
+  for (int stride = 1; stride < WARP_SIZE; stride *= 2) {
+    const float peer = __shfl_xor_sync(ALL_LANES, best, stride);
+    best = fmaxf(best, peer);
+  }
+  return best;
+}
+
+// bfloat16 overload of warp_max: the same XOR-shuffle max-reduction, using
+// __hmax for the comparison. The shuffle mask names all 32 lanes, so every
+// lane of the warp is expected to call this together.
+__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) {
+  static constexpr unsigned ALL_LANES = 0xffffffffu;
+  __nv_bfloat16 best = v;
+  for (int stride = 1; stride < WARP_SIZE; stride *= 2) {
+    const __nv_bfloat16 peer = __shfl_xor_sync(ALL_LANES, best, stride);
+    best = __hmax(best, peer);
+  }
+  return best;
+}
+#endif
+
+template <typename T, typename U>  // global -> shared copy helper
+__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  auto smem_ptr = reinterpret_cast<void*>(_smem_ptr);
+  auto glob_ptr = reinterpret_cast<const void*>(_glob_ptr);
+  const int BYTES = 16;  // byte count passed to cp.async
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));  // 32-bit shared-space address for the "r" operand
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));  // asynchronous; pair with cp_async_fence / cp_async_wait
+#else
+  _smem_ptr[0] = _glob_ptr[0];  // fallback: plain copy of one element. NOTE(review): equals 16 bytes only if sizeof(T) == 16 -- confirm callers
+#endif
+}
+
+// Issues cp.async.commit_group. Compiles to nothing unless built with
+// CUDA >= 11 for sm_80 or newer.
+__device__ __forceinline__ void cp_async_fence() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.commit_group;\n" ::);
+#endif
+}
+
+// Issues cp.async.wait_group with the compile-time operand N. Compiles to
+// nothing unless built with CUDA >= 11 for sm_80 or newer.
+template <int N>
+__device__ __forceinline__ void cp_async_wait() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+#endif
+}
+
+// Specialization for N == 0: issues cp.async.wait_all instead of
+// cp.async.wait_group. Compiles to nothing unless built with CUDA >= 11 for
+// sm_80 or newer.
+template <>
+__device__ __forceinline__ void cp_async_wait<0>() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_all;\n" ::);
+#endif
+}
+
+// Clamps `v` to the range [mmin, mmax] and returns the result.
+//
+// The return statement is deliberately NOT wrapped in an architecture guard:
+// fminf/fmaxf are available on every target, whereas guarding the only
+// `return` with `__CUDA_ARCH__ >= 800` left this non-void function without a
+// return value (undefined behavior) on older architectures and on ROCm,
+// where `__CUDA_ARCH__` is not defined.
+__device__ __forceinline__ float clip(float v, float mmin, float mmax) {
+  return fminf(mmax, fmaxf(v, mmin));
+}
+
+// bfloat16 overload of clip: raises `v` to at least `mmin` with __hmax, then
+// caps the result at `mmax` with __hmin.
+__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v,
+                                              __nv_bfloat16 mmin,
+                                              __nv_bfloat16 mmax) {
+  const __nv_bfloat16 floored = __hmax(v, mmin);
+  return __hmin(mmax, floored);
+}
+
+// Packed bfloat162 overload of clip: applies __hmax2 against `mmin`, then
+// __hmin2 against `mmax`.
+__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v,
+                                               __nv_bfloat162 mmin,
+                                               __nv_bfloat162 mmax) {
+  const __nv_bfloat162 floored = __hmax2(v, mmin);
+  return __hmin2(mmax, floored);
+}
+
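+// We use the following values for fp8 min/max:
+//  __nv_fp8_e4m3 = (-448, +448)
+//  __nv_fp8_e4m3uz = (-240.0, +240.0)
+// It is currently assumed that only these two fp8 formats are used; the
+// static_asserts in get_fp8_max/get_fp8_min reject any other type.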
+// Upper fp8 bound for `T` as a bfloat16 built from its raw 16-bit pattern:
+// +448 for c10::Float8_e4m3fn, +240 for c10::Float8_e4m3fnuz. Any other `T`
+// fails to compile.
+template <class T>
+constexpr __nv_bfloat16 get_fp8_max() {
+  constexpr bool is_e4m3fn = std::is_same_v<T, c10::Float8_e4m3fn>;
+  constexpr bool is_e4m3fnuz = std::is_same_v<T, c10::Float8_e4m3fnuz>;
+  static_assert(is_e4m3fn || is_e4m3fnuz);
+  if constexpr (is_e4m3fn) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 0x43E0});  // 17376 -> +448
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 0x4370});  // 17264 -> +240
+  }
+}
+
+// Lower fp8 bound for `T` as a bfloat16 built from its raw 16-bit pattern:
+// -448 for c10::Float8_e4m3fn, -240 for c10::Float8_e4m3fnuz. Any other `T`
+// fails to compile.
+template <class T>
+constexpr __nv_bfloat16 get_fp8_min() {
+  constexpr bool is_e4m3fn = std::is_same_v<T, c10::Float8_e4m3fn>;
+  constexpr bool is_e4m3fnuz = std::is_same_v<T, c10::Float8_e4m3fnuz>;
+  static_assert(is_e4m3fn || is_e4m3fnuz);
+  if constexpr (is_e4m3fn) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 0xC3E0});  // 50144 -> -448
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 0xC370});  // 50032 -> -240
+  }
+}
+#ifndef USE_ROCM
+template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
+          int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
+          int NUM_STAGES = 3>
+__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
+    const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
+    float* __restrict__ _y_s, const int32_t* __restrict__ counts,
+
+    // sizes
+    int H, int G,
+
+    // strides (in elements)
+    Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
+    Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
+    Idx_t stride_ys_g, Idx_t stride_counts_e) {
+  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
+  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
+  // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
+  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
+
+  // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
+  static constexpr int32_t BFLOAT16_PER_GROUP = 8;
+
+  // We split the shared memory in half, corresponding to gate and up matrices:
+  // [...gate_i, ...up_i]  where 0 <= i < stages.
+  static constexpr int32_t S_NUM_128 =
+      2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
+  static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
+  static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
+  static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
+  __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
+
+  const int32_t tid = threadIdx.x;
+  const int32_t warp_id = tid / WARP_SIZE;
+  const int32_t lane_id = tid % WARP_SIZE;
+
+  auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
+
+  // block handles one (expert e, group g)
+  int32_t pid = blockIdx.x;
+  int32_t e = pid / G;
+  int32_t g = pid % G;
+
+  const int32_t n_tokens = counts[e * stride_counts_e];
+
+  if (!n_tokens) {
+    return;  // Exit ASAP.
+  }
+
+  const Idx_t stride_i_t_128 = stride_i_t / 8u;
+
+  int32_t n_tokens_lower, n_tokens_upper;
+
+  // Each block i iterates over tokens of a slice of n_tokens =
+  // expert_counts[i], with the size of chunk being
+  // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
+  // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
+  if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
+    // Specialize this, but can be likely fused.
+    if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
+      return;
+    }
+    n_tokens_lower = blockIdx.y;
+    n_tokens_upper = blockIdx.y + 1;
+  } else {
+    auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
+    auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
+    auto calc_id = [&](int32_t id) {
+      if (id < residual) {
+        return min(n_tokens, id * (chunk_size + 1));
+      } else {
+        return min(n_tokens, id * chunk_size + residual);
+      }
+    };
+    n_tokens_lower = calc_id(blockIdx.y);
+    n_tokens_upper = calc_id(blockIdx.y + 1);
+  }
+
+  if (n_tokens_lower >= n_tokens_upper) {
+    return;
+  }
+
+  // We do calculations here, using constexpr wherever possible.
+  const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
+  const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
+  const Idx_t base_yq =
+      e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
+  Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
+  auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
+  auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
+                      stride_i_t_128 * n_tokens_lower;
+  auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
+  auto y_s_ptr =
+      _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
+  auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
+                 stride_yq_t * n_tokens_lower + 4 * lane_id;
+  int32_t t_load = n_tokens_lower, load_stage_id = 0;
+  auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
+  auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
+  int32_t stage_offset{};
+
+  static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
+  static constexpr int32_t LOAD_STAGE_MOD =
+      NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
+
+  // Two halves of all threads in a block conduct global loads for gate and up,
+  // respectively.
+  auto load_and_advance_y_pred = [&] {
+    if (t_load < n_tokens_upper) {
+      auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
+      auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
+
+      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
+      // unnecessary ALU ops.
+      stage_offset += LOAD_STAGE_SIZE;
+      stage_offset %= LOAD_STAGE_MOD;
+
+      if (tid < HALF_THREAD_COUNT) {
+        cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
+        gate_128_ptr += stride_i_t_128;
+      } else {
+        cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
+        up_128_ptr += stride_i_t_128;
+      }
+      ++t_load;
+      ++load_stage_id;
+    }
+    // We fence even if there is nothing to load to simplify pipelining.
+    cp_async_fence();
+  };
+
+  #pragma unroll
+  for (int i = 0; i < NUM_STAGES - 1; i++) {
+    load_and_advance_y_pred();
+  }
+
+  __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
+                              s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
+                          lane_id;
+  __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
+
+  static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
+  static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
+
+  int32_t compute_pipeline_offset_64 = 0;
+
+  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
+    __nv_bfloat16 y_max_bf16 = EPS;
+    __nv_bfloat162 results_bf162[2];
+
+    cp_async_wait<NUM_STAGES - 2>();
+    __syncthreads();
+
+    // We double-buffer pipelined loads so that the next load will
+    // concurrently run with compute without overwrites.
+    load_and_advance_y_pred();
+
+    auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
+    auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
+
+    // STAGE_SIZE must also be constexpr!
+    compute_pipeline_offset_64 += STAGE_SIZE;
+    compute_pipeline_offset_64 %= STAGE_MOD;
+
+    // Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
+    __int64_t gate64 = *s_gate_compute_64;
+    __nv_bfloat162* s_gate_compute_32 =
+        reinterpret_cast<__nv_bfloat162*>(&gate64);
+
+    __int64_t up64 = *s_up_compute_64;
+    __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
+
+  #pragma unroll
+    for (int i = 0; i < 2; i++) {
+      // For silu, we make sure that div is emitted.
+      float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
+      results_bf162[i] = __float22bfloat162_rn(gate);
+    }
+
+  #pragma unroll
+    for (int i = 0; i < 2; i++) {
+      results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
+    }
+
+    auto _y_max2 =
+        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
+
+    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
+
+    // An entire group is assigned to a single warp, so a simple warp reduce
+    // is used.
+    __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
+
+    if constexpr (USE_UE8M0) {
+      y_s = hexp2(hceil(hlog2(y_s)));
+    }
+
+    auto inv_y = __float2bfloat16_rn(1.f) / y_s;
+
+    auto y_s2 = make_bfloat162(inv_y, inv_y);
+
+  #pragma unroll
+    for (int32_t i = 0; i < 2; ++i) {
+      results_bf162[i] =
+          clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
+               __bfloat162bfloat162(fp8_max));
+    }
+
+    auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
+    *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
+    y_q_ptr += stride_yq_t;
+
+    if (lane_id == 0) {
+      *y_s_ptr = y_s;
+      y_s_ptr += stride_ys_t;
+    }
+  }
+}
+#endif
+
 }  // namespace vllm

 // Launch activation, gating, and quantize kernel.
@ -119,3 +470,117 @@ void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
  TORCH_CHECK(input.size(-1) % 2 == 0);
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
+
+void silu_mul_fp8_quant_deep_gemm_cuda(
+    const at::Tensor& input,   // (E, T, 2*H)
+    const at::Tensor& counts,  // (E)
+    at::Tensor& y_q,           // (E, T, H) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
+#ifndef USE_ROCM
+  // This kernel relies heavily on cp.async and fp8 support.
+  // This kernel currently only supports H % 128 == 0 and assumes a
+  // fixed GROUP_SIZE of 128.
+  TORCH_CHECK(input.dtype() == torch::kBFloat16);
+  TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
+              y_q.dtype() == torch::kFloat8_e4m3fnuz);
+  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
+  TORCH_CHECK(input.size(-1) % 256 == 0);
+
+  // Check that num_parallel_tokens is a power of 2 and between 1 and 64.
+  TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
+  TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
+
+  using Idx_t = int64_t;
+
+  Idx_t E = input.size(0);
+  Idx_t T = input.size(1);
+  Idx_t H = input.size(2) / 2;
+  Idx_t stride_i_e = input.stride(0);
+  Idx_t stride_i_t = input.stride(1);
+  Idx_t stride_i_h = input.stride(2);
+  Idx_t stride_yq_e = y_q.stride(0);
+  Idx_t stride_yq_t = y_q.stride(1);
+  Idx_t stride_yq_h = y_q.stride(2);
+  Idx_t stride_ys_e = y_s.stride(0);
+  Idx_t stride_ys_t = y_s.stride(1);
+  Idx_t stride_ys_g = y_s.stride(2);
+
+  Idx_t stride_counts_e = counts.stride(0);
+
+  static constexpr int GROUP_SIZE = 128;
+
+  #define KERNEL_FN                                                         \
+    if (use_ue8m0) {                                                        \
+      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
+                                                NUM_PARALLEL_TOKENS, true>  \
+          <<<grid, block, 0, stream>>>(                                     \
+              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
+              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
+              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
+              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
+              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
+              stride_counts_e);                                             \
+    } else {                                                                \
+      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
+                                                NUM_PARALLEL_TOKENS, false> \
+          <<<grid, block, 0, stream>>>(                                     \
+              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
+              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
+              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
+              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
+              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
+              stride_counts_e);                                             \
+    }
+
+  #define KERNEL_CALL_H                                       \
+    if (H % (4 * GROUP_SIZE) == 0) {                          \
+      static constexpr int NUM_WARPS = 4;                     \
+      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
+      KERNEL_FN                                               \
+    } else {                                                  \
+      static constexpr int NUM_WARPS = 1;                     \
+      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
+      KERNEL_FN                                               \
+    }
+
+  #define KERNEL_CALL_TOP_LEVEL                      \
+    if (num_parallel_tokens == 1) {                  \
+      static constexpr int NUM_PARALLEL_TOKENS = 1;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 2) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 2;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 4) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 4;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 8) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 8;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 16) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 16; \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 32) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 32; \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 64) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 64; \
+      KERNEL_CALL_H                                  \
+    }
+
+  Idx_t G;
+  dim3 block, grid;
+  auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
+    G = H / Idx_t(group_size * num_warps);
+    grid = dim3(E * G, _num_parallel_tokens);
+    block = dim3(num_warps * WARP_SIZE);
+  };
+  // Install the device guard first so the stream belongs to input's device.
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
+                          "silu_mul_fp8_quant_deep_gemm_kernel",
+                          [&] { KERNEL_CALL_TOP_LEVEL });
+
+#endif
+}
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -7,17 +7,10 @@

 #include <cmath>

+#include "../../cub_helpers.h"
 #include "../../dispatch_utils.h"
 #include "../vectorization_utils.cuh"

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-  #include <cub/util_type.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-  #include <hipcub/util_type.hpp>
-#endif
-
 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
  static constexpr auto i8_min =
@ -173,7 +166,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
      });
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
-  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
+  float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
  __shared__ float absmax;
  if (tid == 0) {
    absmax = block_max;
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@ -14,9 +14,6 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"

-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"

 namespace vllm {
@ -149,6 +146,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

@ -169,26 +167,29 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());

-  auto mainloop_args = [&](){
-    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
-    if (swap_ab) {
-      return typename GemmKernel::MainloopArguments{
-          b_ptr,        b_stride,   a_ptr,        a_stride,
-          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
-      };
-    }
-    else {
-      return typename GemmKernel::MainloopArguments{
-          a_ptr,        a_stride,   b_ptr,        b_stride,
-          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
-      };
-    }
-  }();
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.layout_SFB = layout_SFB;
+  if (swap_ab) {
+    mainloop_args.ptr_A = b_ptr;
+    mainloop_args.dA = b_stride;
+    mainloop_args.ptr_B = a_ptr;
+    mainloop_args.dB = a_stride;
+    mainloop_args.ptr_SFA = b_scales_ptr;
+    mainloop_args.ptr_SFB = a_scales_ptr;
+  } else {
+    mainloop_args.ptr_A = a_ptr;
+    mainloop_args.dA = a_stride;
+    mainloop_args.ptr_B = b_ptr;
+    mainloop_args.dB = b_stride;
+    mainloop_args.ptr_SFA = a_scales_ptr;
+    mainloop_args.ptr_SFB = b_scales_ptr;
+  }
  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@ -14,9 +14,6 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"

-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"

 namespace vllm {
@ -128,6 +125,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;

  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

@ -146,17 +144,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  LayoutSFB layout_SFB = 
      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());

-  auto mainloop_args = [&](){
-    return typename GemmKernel::MainloopArguments{
-        a_ptr,        a_stride,   b_ptr,        b_stride,
-        a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
-    };
-  }();
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.ptr_A = a_ptr;
+  mainloop_args.dA = a_stride;
+  mainloop_args.ptr_B = b_ptr;
+  mainloop_args.dB = b_stride;
+  mainloop_args.ptr_SFA = a_scales_ptr;
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.ptr_SFB = b_scales_ptr;
+  mainloop_args.layout_SFB = layout_SFB;
  auto prob_shape = cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@ -13,27 +13,18 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"

-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"

 namespace vllm {

 using namespace cute;

-template <typename SchedulerType, typename OutType, int GroupSizeM_,
-          int GroupSizeN_, int GroupSizeK_, int TileSizeM_ = 128,
-          class ClusterShape = Shape<_1, _2, _1>>
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler>
 struct cutlass_3x_gemm_fp8_blockwise {
-  using GroupSizeM = Int<GroupSizeM_>;
-  using GroupSizeN = Int<GroupSizeN_>;
-  using GroupSizeK = Int<GroupSizeK_>;
-  using TileSizeM = Int<TileSizeM_>;
-
-  static_assert(TileSizeM_ % GroupSizeM_ == 0,
-                "TileSizeM must be a multiple of GroupSizeM");
-
  using ElementAB = cutlass::float_e4m3_t;

  using ElementA = ElementAB;
@ -45,52 +36,67 @@ struct cutlass_3x_gemm_fp8_blockwise {
  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementD = OutType;
-  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  using LayoutD = cutlass::layout::RowMajor;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

-  using ElementC = void;
-  using StrideC = StrideD;
+  using ElementC = void; // TODO: support bias
+  using LayoutC = LayoutD;
  static constexpr int AlignmentC = AlignmentD;

  using ElementAccumulator = float;
-  using ElementBlockScale = float;
  using ElementCompute = float;
+  using ElementBlockScale = float;
+
+  using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
+
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
  using ArchTag = cutlass::arch::Sm90;
  using OperatorClass = cutlass::arch::OpClassTensorOp;
-  using TileShape = Shape<TileSizeM, GroupSizeN, GroupSizeK>;

-  using KernelSchedule = cutlass::gemm::
-      KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
-          GroupSizeM_>;
-  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
-  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using ElementScalar = float;
+  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      EpilogueScheduler,
+      DefaultOperation
+  >::CollectiveOp;

-  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<
-      cutlass::epilogue::fusion::Sm90AccFetch>;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
-          ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC,
-          ElementD, StrideD, AlignmentD, EpilogueSchedule,
-          StoreEpilogueCompute>::CollectiveOp;
-
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB,
-          LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
-              sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          KernelSchedule>::CollectiveOp;
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduler
+  >::CollectiveOp;

  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      SchedulerType>>;
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;

  struct GemmKernel : public KernelType {};
-
-  using StrideA = typename GemmKernel::StrideA;
-  using StrideB = typename GemmKernel::StrideB;
 };

 template <typename Gemm>
@ -99,76 +105,58 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                   torch::Tensor const& a_scales,
                                   torch::Tensor const& b_scales) {
  using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutSFA = typename Gemm::LayoutSFA;
+  using LayoutSFB = typename Gemm::LayoutSFB;
+  using ScaleConfig = typename Gemm::ScaleConfig;

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = typename Gemm::ElementBlockScale;

-  auto prob_shape = c3x::get_problem_shape(a, b);
-  int32_t m = get<0>(prob_shape), n = get<1>(prob_shape),
-          k = get<2>(prob_shape);
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
+  TORCH_CHECK(m % 4 == 0, "m must be divisible by 4");

-  using StrideA = Stride<int64_t, Int<1>, int64_t>;
-  using StrideB = Stride<int64_t, Int<1>, int64_t>;
-  using StrideC = typename Gemm::StrideC;
+  StrideA a_stride;
+  StrideB b_stride;
+  StrideC c_stride;
+  a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));

-  StrideA a_stride{lda, Int<1>{}, 0};
-  StrideB b_stride{ldb, Int<1>{}, 0};
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+  LayoutSFA layout_SFA = 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = 
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
-  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());

-  // Check is the t is contiguous and is 1D or 2D with one of the dimensions
-  // being 1 (i.e. a row or column vector)
-  auto is_contiguous_vector = [](const torch::Tensor& t) {
-    auto t_sizes = t.sizes();
-    return t.is_contiguous() &&
-           (t.dim() == 1 ||
-            (t.dim() == 2 &&
-             *std::min_element(t_sizes.begin(), t_sizes.end()) == 1));
-  };
-
-  // TODO(lucas): lets clean-up the kernel so that we pass in Strides so
-  //  we don't have to deal with enforcing implicit layouts
-  TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value);
-  TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value);
-  TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales),
-              "a_scales must be M major");
-  TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value);
-  TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value);
-  TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales),
-              "b_scales must be K major");
-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr};
+  typename GemmKernel::MainloopArguments mainloop_args{};
+  mainloop_args.ptr_A = a_ptr;
+  mainloop_args.dA = a_stride;
+  mainloop_args.ptr_B = b_ptr;
+  mainloop_args.dB = b_stride;
+  mainloop_args.ptr_SFA = a_scales_ptr;
+  mainloop_args.layout_SFA = layout_SFA;
+  mainloop_args.ptr_SFB = b_scales_ptr;
+  mainloop_args.layout_SFB = layout_SFB;
+  auto prob_shape = cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
  typename GemmKernel::EpilogueArguments epilogue_args{
      {}, c_ptr, c_stride, c_ptr, c_stride};
-
-  typename GemmKernel::TileSchedulerArguments scheduler;
-
-  static constexpr bool UsesStreamKScheduler =
-      cute::is_same_v<typename GemmKernel::TileSchedulerTag,
-                      cutlass::gemm::StreamKScheduler>;
-
-  if constexpr (UsesStreamKScheduler) {
-    using DecompositionMode = typename cutlass::gemm::kernel::detail::
-        PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
-    using ReductionMode = typename cutlass::gemm::kernel::detail::
-        PersistentTileSchedulerSm90StreamKParams::ReductionMode;
-
-    scheduler.decomposition_mode = DecompositionMode::StreamK;
-    scheduler.reduction_mode = ReductionMode::Nondeterministic;
-  }
-
  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
-                                       epilogue_args, scheduler);
+                                       epilogue_args);
 }

 template <typename OutType>
@ -177,18 +165,12 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
                                              torch::Tensor const& b,
                                              torch::Tensor const& a_scales,
                                              torch::Tensor const& b_scales) {
-  auto k = a.size(1);
-  auto n = b.size(1);
-
-  if (k > 3 * n) {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>(
-        out, a, b, a_scales, b_scales);
-  } else {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>(
-        out, a, b, a_scales, b_scales);
-  }
+  // TODO: better heuristics
+  cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+      OutType, 1, 128, 128, Shape<_128, _128, _128>,
+      Shape<_1, _2, _1>, cutlass::epilogue::TmaWarpSpecializedCooperative,
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>>(
+      out, a, b, a_scales, b_scales);
 }

 }  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@ -32,7 +32,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
    TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor.");
    TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor.");
    int32_t version_num = get_sm_version_num();
-    if (version_num >= 100) {
+    if (version_num >= 90) {
      TORCH_CHECK(
          a.size(0) == a_scales.size(0) &&
              cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1),
@ -41,32 +41,6 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
          cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) &&
              cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1),
          "b_scale_group_shape must be [128, 128].");
-    } else {
-      // TODO: Remove this after using cutlass sm90 blockwise scaling gemm
-      // kernel, or introducing ceil_div to the load_init() of mainloop.
-      using GroupShape = std::array<int64_t, 2>;
-      auto make_group_shape = [](torch::Tensor const& x,
-                                 torch::Tensor const& s) -> GroupShape {
-        TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
-        return {cuda_utils::ceil_div(x.size(0), s.size(0)),
-                cuda_utils::ceil_div(x.size(1), s.size(1))};
-      };
-
-      GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
-      GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
-
-      // 1x128 per-token group scales for activations
-      // 128x128 blockwise scales for weights
-      TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
-                   b_scale_group_shape == GroupShape{128, 128} &&
-                   a.dtype() == torch::kFloat8_e4m3fn &&
-                   b.dtype() == torch::kFloat8_e4m3fn),
-                  "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
-                  "a_scale_group_shape must be [1, 128]. Got: [",
-                  a_scale_group_shape[0], ", ", a_scale_group_shape[1],
-                  "]\n"
-                  "b_scale_group_shape must be [128, 128]. Got: [",
-                  b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
    }

    TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@ -1,15 +1,10 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
+#include "../../cub_helpers.h"
 #include "../vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif
-
 namespace vllm {

 template <typename scalar_t, typename fp8_type>
@ -116,7 +111,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  const float block_max =
-      BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(tmp).Reduce(absmax_val, CubMaxOp{}, blockDim.x);

  __shared__ float token_scale;
  if (tid == 0) {
--- a/csrc/quantization/fp8/common.cuh
+++ b/csrc/quantization/fp8/common.cuh
@ -5,7 +5,9 @@

 #include <cmath>

-#ifdef USE_ROCM
+#ifndef USE_ROCM
+  #include "nvidia/quant_utils.cuh"
+#else
  #include "amd/quant_utils.cuh"
 #endif

@ -48,7 +50,9 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
  float r =
      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
-  return static_cast<fp8_type>(r);
+  // Use hardware cvt instruction for fp8 on nvidia
+  // Currently only supports fp8_type = c10::Float8_e4m3fn
+  return fp8::vec_conversion<fp8_type, float>(r);
 #else
  // Use hardware cvt instruction for fp8 on rocm
  return fp8::cvt_c10<fp8_type>(r);
--- a/csrc/quantization/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh
@ -12,13 +12,26 @@ namespace vllm {
 namespace fp8 {
  #ifdef ENABLE_FP8

-    #if 0  // Disable the following code to reduce the binary size.
 template <typename Tout, typename Tin>
-__inline__ __device__ Tout
-vec_conversion(const Tin &x, const __nv_fp8_interpretation_t fp8_type) {
+__inline__ __device__ Tout vec_conversion(
+    const Tin& x, const __nv_fp8_interpretation_t fp8_type = __NV_E4M3) {  // generic fallback: returns x as Tout via the return conversion; fp8_type is not used in this body
   return x;
 }

+// float -> c10::Float8_e4m3fn. When __CUDA_ARCH__ is defined and < 800 this is a plain static_cast; otherwise the bits come from __nv_cvt_float_to_fp8(a, __NV_SATFINITE, fp8_type) and are wrapped with from_bits().
+template <>
+__inline__ __device__ c10::Float8_e4m3fn
+vec_conversion<c10::Float8_e4m3fn, float>(
+    const float& a, const __nv_fp8_interpretation_t fp8_type) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+  return static_cast<c10::Float8_e4m3fn>(a);
+    #else
+  return c10::Float8_e4m3fn(__nv_cvt_float_to_fp8(a, __NV_SATFINITE, fp8_type),
+                            c10::Float8_e4m3fn::from_bits());
+    #endif
+}
+
+    #if 0  // Disable the following code to reduce the binary size.
 // fp8 -> half
 template <>
 __inline__ __device__ uint16_t vec_conversion<uint16_t, uint8_t>(
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@ -8,11 +8,7 @@
 #include "quantization/utils.cuh"
 #include "quant_conversions.cuh"

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif
+#include "../../cub_helpers.h"

 namespace vllm {

@ -36,7 +32,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
+  ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);

  __shared__ float s_rms;
  if (threadIdx.x == 0) {
@ -73,7 +69,7 @@ __device__ void compute_dynamic_per_token_scales(
  __shared__ typename BlockReduce::TempStorage reduceStore;
  block_absmax_val_maybe =
      BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
+          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);

  __shared__ float s_token_scale;
  if (threadIdx.x == 0) {
@ -169,7 +165,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
-  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
+  ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);

  __shared__ float s_rms;
  if (threadIdx.x == 0) {
@ -240,7 +236,7 @@ __device__ void compute_dynamic_per_token_scales(
  __shared__ typename BlockReduce::TempStorage reduceStore;
  block_absmax_val_maybe =
      BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
+          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);

  __shared__ float s_token_scale;
  if (threadIdx.x == 0) {
--- a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
+++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
@ -0,0 +1,817 @@
+// clang-format off
+// Adapted from: https://github.com/meta-pytorch/applied-ai/blob/main/kernels/cuda/inference/hadamard_transform/hadamard_transform_cuda.cu
+
+/***********
+Copyright 2024 Meta
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+***********/
+
+#include <torch/all.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+#include <mma.h>
+#include <cuda/annotated_ptr>
+#include <c10/cuda/CUDAException.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "core/registration.h"
+#include "dispatch_utils.h"
+
+namespace hadacore {
+
+#ifndef __CUDACC__ // when __CUDACC__ is not defined, make __launch_bounds__ expand to nothing so the kernel declaration below still parses
+#define __launch_bounds__(x,y)
+#endif
+
+#define MAX_WARPS_PER_SM 48 // NOTE(review): no use is visible in this part of the file; presumably an occupancy cap used by the launcher - confirm
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) // function-like macro: each argument may be evaluated twice, so pass side-effect-free expressions
+
+using b16 = uint16_t; // raw 16-bit container; used below for fp16/bf16 bit patterns
+using b32 = uint32_t; // raw 32-bit container; used below to carry two packed b16 values per register
+
+constexpr int launch_configs_big[7][3] = { // 7 rows x 3 ints; NOTE(review): columns look like {num_chunks, warps_per_block, blocks_per_sm} - confirm at the launch site
+    // default (the only active table; the commented-out tables below are alternative tunings kept for reference)
+    {2, 1, 24},
+    {2, 2, 16}, 
+    {2, 4, 8}, 
+    {2, 8, 4}, 
+    {2, 16, 3},
+    {4, 16, 2},
+    {8, 16, 1}
+    // // extra coalescing
+    // {2, 1, 24},
+    // {2, 2, 16}, 
+    // {2, 4, 8}, 
+    // {2, 8, 4}, 
+    // {4, 8, 3},
+    // {8, 8, 2},
+    // {16, 8, 1}
+    // // less coalescing
+    // {2, 1, 24},
+    // {2, 2, 16}, 
+    // {2, 4, 8}, 
+    // {2, 8, 4}, 
+    // {1, 32, 1},
+    // {2, 32, 1},
+    // {4, 32, 1}
+};
+
+// a 4x2, b 2x2, c 2x2 (per thread: a = 4 b32 registers, b = 2, c = 2, each b32 holding two b16 values). Issues one m16n8k16 mma.sync whose accumulator inputs are all zero ("noacc"), so c0/c1 receive the product only. dtype must be Half or BFloat16.
+template <torch::ScalarType dtype>
+__device__ __forceinline__ void mma_m16_n8_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32& c0, b32& c1){
+    static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16);
+    // PTX operand order is d, a, b, c; `zero` below is passed for every accumulator (c) operand
+    b32 zero = 0;
+    if constexpr(dtype == torch::ScalarType::Half) {
+        asm (
+            "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+            "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n\t"
+            : "=r"(c0), "=r"(c1) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero)
+        );
+    } else {
+        b32 temp0, temp1, temp2, temp3; // bf16 path: the mma produces four f32 results per thread, which are narrowed back to packed bf16 below
+        asm (
+            "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n\t"
+            : "=r"(temp0), "=r"(temp1), "=r"(temp2), "=r"(temp3) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero), "r"(zero), "r"(zero)
+        );
+        asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c0) : "r"(temp1), "r"(temp0)); // per PTX cvt semantics: c0 = {upper: bf16(temp1), lower: bf16(temp0)}
+        asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c1) : "r"(temp3), "r"(temp2)); // per PTX cvt semantics: c1 = {upper: bf16(temp3), lower: bf16(temp2)}
+    }
+}
+
+// a 4x2, b 4x2, c 4x2 (per thread: 4 b32 registers each for a, b and c). Built from two m16n8k16 calls that share a0..a3: (b0, b1) -> (c0, c1) and (b2, b3) -> (c2, c3).
+template <torch::ScalarType dtype>
+__device__ __forceinline__ void mma_m16_n16_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32 b2, b32 b3, b32& c0, b32& c1, b32& c2, b32& c3){
+    mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b0, b1, c0, c1);
+    mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b2, b3, c2, c3); // b2/b3 are by-value parameters, so the writes to c0/c1 above cannot change them even if a caller passes the same variables for b and c
+}
+
+__device__ __forceinline__ void matrix_transpose_m8_n8_b16_inplace(b32& a0) { // overwrites a0 with the output of movmatrix (.trans, m8n8, b16) applied to a0; the instruction is .sync.aligned, i.e. a warp-level operation
+    asm (
+        "movmatrix.sync.aligned.m8n8.trans.b16 "
+        "%0, %1;\n\t"
+        : "=r"(a0) : "r"(a0)
+    );
+}
+
+#define p_p(i) ((val_1p[i] & 0x0000FFFF) | val_1p[i] << 16) // x_y(i) packs two 16-bit entries into one 32-bit word: low half from val_1x[i], high half from val_1y[i]; here low = val_1p, high = val_1p
+#define p_n(i) ((val_1p[i] & 0x0000FFFF) | val_1n[i] << 16) // low = val_1p, high = val_1n
+#define n_p(i) ((val_1n[i] & 0x0000FFFF) | val_1p[i] << 16) // low = val_1n, high = val_1p
+#define n_n(i) ((val_1n[i] & 0x0000FFFF) | val_1n[i] << 16) // low = val_1n, high = val_1n
+
+template<int64_t num_chunks, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool enable_mask, torch::ScalarType dtype>
+__global__ void __launch_bounds__(32 * warps_per_block, blocks_per_sm)
+// a is column major, b is row major
+hadamard_transform_kernel(b16* a, b16* out, int total_num_chunks) {
+    static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
+
+    b32 b_frag_all[num_chunks][4]; // for all chunks, holds matrix fragment (which takes 4 regs of b16x2 * 32 threads)
+
+    int64_t blockid = blockIdx.x * warps_per_block + threadIdx.x / 32;
+    int64_t threadid = threadIdx.x % 32;
+    extern __shared__ b32 bfrag_arr[]; // num_chunks * warps_per_block * 128
+    int64_t real_num_chunks = ((blockid + 1) * num_chunks) > total_num_chunks ? (total_num_chunks - (blockid * num_chunks)) : num_chunks;
+    int64_t diff_num_chunks = real_num_chunks - num_chunks;
+
+    b32* a_start_ptr = (b32*) (a + blockid * num_chunks * 256); // offset a to where this warp starts
+    b32* out_start_ptr = (b32*) (out + blockid * num_chunks * 256);
+    b32* a_ptr = a_start_ptr + threadid * 4;
+    b32* b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128 + threadid * 4;
+
+    #if (__CUDA_ARCH__ < 900) // SM80, SM89
+    uint64_t cache_policy;
+    asm volatile(
+        "createpolicy.fractional.L2::evict_first.b64 %0, 1.0;\n"
+        : "=l"(cache_policy)
+    );
+    #endif
+
+    #pragma unroll
+    for (int64_t k = 0; k < num_chunks; k++) {
+        size_t shared_ptr = __cvta_generic_to_shared(b_frag_ptr);
+        #if (__CUDA_ARCH__ >= 900) // SM90
+            asm volatile(
+                "cp.async.cg.shared.global [%0], [%1], 16;\n"
+                "cp.async.commit_group;\n"
+                :: "l"(shared_ptr), "l"(a_ptr)
+            );
+        #else // SM80, SM89
+            asm volatile(
+                "cp.async.cg.shared.global.L2::cache_hint.L2::256B [%0], [%1], 16, %2;\n"
+                "cp.async.commit_group;\n"
+                :: "l"(shared_ptr), "l"(a_ptr), "l"(cache_policy)
+            );
+        #endif
+
+        a_ptr += 128;
+        b_frag_ptr += 128;
+    }
+
+    // generate hadamard 16x16 (up to 2 of them)
+    constexpr b16 fp16_1p[4] = {0b0011100110101000, 0b0011100000000000, 0b0011010110101000, 0b0011010000000000};
+    constexpr b16 fp16_1n[4] = {0b1011100110101000, 0b1011100000000000, 0b1011010110101000, 0b1011010000000000};
+    constexpr b16 bf16_1p[4] = {0b0011111100110101, 0b0011111100000000, 0b0011111010110101, 0b0011111010000000};
+    constexpr b16 bf16_1n[4] = {0b1011111100110101, 0b1011111100000000, 0b1011111010110101, 0b1011111010000000};
+
+    #define val_type_1p(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1p[i]) : (bf16_1p[i]))
+    #define val_type_1n(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1n[i]) : (bf16_1n[i]))
+    constexpr b16 val_1p[4] = {val_type_1p(0), val_type_1p(1), val_type_1p(2), val_type_1p(3)};
+    constexpr b16 val_1n[4] = {val_type_1n(0), val_type_1n(1), val_type_1n(2), val_type_1n(3)};
+
+    constexpr b32 p_p[4] = {p_p(0), p_p(1), p_p(2), p_p(3)};
+    constexpr b32 p_n[4] = {p_n(0), p_n(1), p_n(2), p_n(3)};
+    constexpr b32 n_p[4] = {n_p(0), n_p(1), n_p(2), n_p(3)};
+    constexpr b32 n_n[4] = {n_n(0), n_n(1), n_n(2), n_n(3)};
+    const b32 had_16_p1[4][4] = {
+        {
+            0b10001000010001000010001000010001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b10001000010001000010001000010001
+        },
+        {
+            0b11001100100010000011001100100010,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11001100100010000011001100100010
+        },
+        {
+            0b11111111101010101100110010011001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11111111101010101100110010011001
+        },
+        {
+            0b11111111101010101100110010011001,
+            0b11111111101010101100110010011001,
+            0b11111111101010101100110010011001,
+            0b00000000010101010011001101100110
+        }
+    };
+    const b32 had_16_p2[4][4] = {
+        {
+            0b10000000010000000010000000010000,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b10000000010000000010000000010000
+        },
+        {
+            0b11000000100001000011000000100001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11000000100001000011000000100001
+        },
+        {
+            0b11110000101001011100001110010110,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11110000101001011100001110010110
+        },
+        {
+            0b11110000101001011100001110010110,
+            0b11110000101001011100001110010110,
+            0b11110000101001011100001110010110,
+            0b00001111010110100011110001101001
+        }
+    };
+    const b32 had_16_mask[3][4] = {
+        {
+            0b10001000010001000010001000010001,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b10001000010001000010001000010001
+        },
+        {
+            0b11001100110011000011001100110011,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11001100110011000011001100110011
+        },
+        {
+            0b11111111111111111111111111111111,
+            0b00000000000000000000000000000000,
+            0b00000000000000000000000000000000,
+            0b11111111111111111111111111111111
+        }
+    };
+    b32 had_frag[8];
+    #pragma unroll
+    for (int64_t i = 0; i < 2; i++) {
+        int64_t c_log_h = (i == 0) ? MIN(4, log_had_size) : log_had_size % 4;
+        #pragma unroll
+        for (int64_t j = 0; j < 4; j++) {
+            if (c_log_h < 4) {
+                bool mask = had_16_mask[c_log_h - 1][j] & (1 << (31 - threadid));
+                if (!mask) {
+                    had_frag[i * 4 + j] = 0;
+                    continue;
+                }
+            }
+            bool pred1 = had_16_p1[c_log_h - 1][j] & (1 << (31 - threadid));
+            bool pred2 = had_16_p2[c_log_h - 1][j] & (1 << (31 - threadid));
+            b32 val = pred1 ? (pred2 ? p_p[c_log_h - 1] : p_n[c_log_h - 1]) : (pred2 ? n_p[c_log_h - 1] : n_n[c_log_h - 1]);
+            had_frag[i * 4 + j] = val;
+        }
+        if constexpr(log_had_size <= 4 || log_had_size % 4 == 0) break;
+    }
+
+    // log had size above 8, only used for above 2^8 = 256 size
+    constexpr int64_t part8_log_had_size = log_had_size - 8;
+
+    b32* a_chunk_ptr = a_start_ptr; // first chunk starts at this warp's data starts
+    b32* out_chunk_ptr = out_start_ptr;
+
+    #pragma unroll
+    for (int64_t l = 0; l < 2; l++) {
+        if constexpr(log_had_size <= 8) { // l == 0 guaranteed, redundant simplified version of else body, to help compiler warnings
+            b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128;
+        } else {
+            b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * (l == 0 ? 128 : (128 >> part8_log_had_size));
+        }
+
+        if (l == 1) {
+            if constexpr(log_had_size > 8) {
+                __syncthreads(); // sync between first and second iterations if above size 256
+
+                if constexpr(log_had_size >= 12) {
+                    // sizes 4k and above
+
+                    // a + threadblock offset + warp offset
+                    // can then index into all chunks owned by this warp
+                    b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block));
+
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        #pragma unroll
+                        for (int64_t k = 0; k < num_chunks; k++) {
+                            // here, j represents register, and k represents 8-offset/chunk
+                            uint64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data
+                            
+                            int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread #
+                            int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data)
+                            int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads)
+                            int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads
+                            int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register
+                            int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index
+
+                            // fix idx for majorness
+                            int64_t rowidx = idx % (1 << part8_log_had_size);
+                            int64_t colidx = idx >> part8_log_had_size;
+
+                            // store[rowidx * 128 + colidx] = data;
+                            b32 data = store[rowidx * 128 + colidx];
+
+                            // compiler generates excessive instructions, so we manually do the if statement
+                            #pragma unroll
+                            for (uint64_t i = 0; i < num_chunks; i++) {
+                                asm volatile (
+                                    "{\n\t"
+                                    "  .reg .pred p0;\n\t"
+                                    "  setp.eq.s64 p0, %1, %2;\n\t"
+                                    "  @p0 mov.b32 %0, %3;\n\t"
+                                    "}\n\t"
+                                    : "+r"(b_frag_all[i][j]) // Output operand %0
+                                    : "l"(real_chunk_num), "l"(i), "r"(data) // Input operands %1, %2, %3
+                                );
+                            }
+                        }
+                    }
+
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        #pragma unroll
+                        for (int64_t k = 1; k < num_chunks; k++) {
+                            int64_t threadid_contig = threadid % num_chunks;
+                            int64_t threadid_mul = threadid / num_chunks;
+                            int64_t threadid2 = (threadid_contig + num_chunks - k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to
+                            b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2);
+                        }
+                    }
+                }
+            }
+        }
+
+        #pragma unroll
+        for (int64_t k = 0; k < num_chunks; k++) {
+            if constexpr(enable_mask) {
+                if (k >= real_num_chunks)
+                    break;
+            }
+            if (l == 0) {
+                // bad fix for k not being recognized as a constexpr by compiler
+                // asm("cp.async.wait_group %0;\n" :: "n"(num_chunks - k - 1));
+                #define SWITCH_WAIT_ASYNC_LOAD_GROUP(i) case i: asm volatile("cp.async.wait_group %0;\n" :: "n"(num_chunks - i - 1)); break;
+                if constexpr(enable_mask) {
+                    switch(k + diff_num_chunks) {
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(0)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(1)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(2)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(3)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(4)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(5)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(6)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(7)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(8)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(9)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(10)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(11)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(12)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(13)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(14)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(15)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(16)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(17)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(18)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(19)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(20)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(21)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(22)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(23)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(24)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(25)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(26)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(27)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(28)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(29)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(30)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(31)
+                    }
+                } else {
+                    switch(k) {
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(0)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(1)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(2)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(3)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(4)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(5)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(6)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(7)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(8)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(9)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(10)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(11)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(12)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(13)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(14)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(15)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(16)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(17)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(18)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(19)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(20)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(21)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(22)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(23)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(24)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(25)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(26)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(27)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(28)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(29)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(30)
+                        SWITCH_WAIT_ASYNC_LOAD_GROUP(31)
+                    }
+                }
+            }
+
+            if (l == 0) {
+                // loading for the first iteration
+
+                // thread 0 loads  [t0r0, t16r1, t0r2, t16r3]
+                // thread 16 loads [t0r1, t16r0, t0r3, t16r2]
+                // allows full coalescing, same for t1/t17, t2/t18, etc.
+                #pragma unroll
+                for (int64_t j = 0; j < 4; j++) {
+                    int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2));
+                    int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16);
+                    int64_t real_row = real_thread_id % 4;
+                    int64_t real_col = real_thread_id / 4;
+                    b_frag_all[k][j] = b_frag_ptr[(real_row + (reg % 2) * 4) + (real_col + (j / 2) * 8) * 8];
+                }
+
+                // for t16 swap r0/r1 and r2/r3 to have [t16r0, t0r1, t16r2, t0r3]
+                // so registers are in right order, same for t17, t18, etc.
+                if ((threadid & 16) != 0) {
+                    b32 temp = b_frag_all[k][0];
+                    b_frag_all[k][0] = b_frag_all[k][1];
+                    b_frag_all[k][1] = temp;
+
+                    temp = b_frag_all[k][2];
+                    b_frag_all[k][2] = b_frag_all[k][3];
+                    b_frag_all[k][3] = temp;
+                }
+
+                // t0 and t16 swap r1 and r3 to have their own data,
+                // same for t1/t17, t2/18, etc.
+                #pragma unroll
+                for (int64_t j = 1; j < 4; j += 2) {
+                    b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16);
+                }
+            } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings
+                if constexpr(log_had_size < 12) {
+                    // sizes 512, 1k, and 2k
+
+                    // for 512:
+                    //     thread 0 loads  [t0r0, t0r1, t16r2, t16r3]
+                    //     thread 16 loads [t0r2, t0r3, t16r0, t16r1]
+                    //     same for t1/t17, t2/t18, etc.
+                    // for 1k and 2k:
+                    //     thread 0 loads [t0r0, t0r1, t1r2, t1r3]
+                    //     thread 1 loads [t0r2, t0r3, t1r0, t1r1]
+                    //     same for t2/t3, t4/t5, etc.
+                    // allows full coalescing for 512 and 1k, 16x coalescing for 2k
+                    constexpr int64_t xor_val = log_had_size == 9 ? 16 : 1;
+
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4;
+                        int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val);
+                        int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2);
+                        int64_t rowidx = idx % (1 << part8_log_had_size);
+                        int64_t colidx = idx >> part8_log_had_size;
+                        b_frag_all[k][j] = b_frag_ptr[rowidx * 128 + colidx];
+                    }
+
+                    if ((threadid & xor_val) != 0) {
+                        b32 temp = b_frag_all[k][0];
+                        b_frag_all[k][0] = b_frag_all[k][2];
+                        b_frag_all[k][2] = temp;
+
+                        temp = b_frag_all[k][1];
+                        b_frag_all[k][1] = b_frag_all[k][3];
+                        b_frag_all[k][3] = temp;
+                    }
+
+                    #pragma unroll
+                    for (int64_t j = 2; j < 4; j++) {
+                        b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val);
+                    }
+                }
+            }
+
+            if (l == 1) {
+                // for second iteration, we load 2 consecutive b16s (1 b32) per register,
+                // but tensor core register layout requires 2 b16s that are in the
+                // same column/consecutive rows to be in the same register, so do the swap
+                b32 f0 = ((b_frag_all[k][1] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF);
+                b32 f1 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][2] & 0xFFFF);
+                b32 f2 = (b_frag_all[k][1] & 0xFFFF0000) | (b_frag_all[k][0] >> 16);
+                b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][2] >> 16);
+                b_frag_all[k][0] = f0;
+                b_frag_all[k][1] = f1;
+                b_frag_all[k][2] = f2;
+                b_frag_all[k][3] = f3;
+            }
+
+            #pragma unroll
+            for(int64_t i = 0, remaining_log_had_size = log_had_size - l * 8; i < 2 && remaining_log_had_size > 0; i++) {
+                int64_t had_off = ((remaining_log_had_size < 4) && !(log_had_size <= 4 || log_had_size % 4 == 0)) ? 4 : 0;
+                mma_m16_n16_k16_b16_b16_b16_noacc<dtype>(had_frag[had_off + 0], had_frag[had_off + 1], had_frag[had_off + 2], had_frag[had_off + 3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3]);
+
+                remaining_log_had_size -= 4;
+                if (remaining_log_had_size <= 0 && i == 0) {
+                    // TODO: consider different storing so no need for transpose
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][0]);
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][1]);
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][2]);
+                    matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][3]);
+                } else {
+                    // swap and use output directly as b_frag for next iteration as an actually free transpose
+                    b32 temp = b_frag_all[k][1];
+                    b_frag_all[k][1] = b_frag_all[k][2];
+                    b_frag_all[k][2] = temp;
+                }
+            }
+
+            if (l == 1) {
+                // invert swap from above for second iteration
+                b32 f0 = ((b_frag_all[k][2] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF);
+                b32 f1 = (b_frag_all[k][2] & 0xFFFF0000) | (b_frag_all[k][0] >> 16);
+                b32 f2 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][1] & 0xFFFF);
+                b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][1] >> 16);
+                b_frag_all[k][0] = f0;
+                b_frag_all[k][1] = f1;
+                b_frag_all[k][2] = f2;
+                b_frag_all[k][3] = f3;
+            }
+
+            if (l == 0) {
+                // inverse of coalesced load for first iteration to store result
+                #pragma unroll
+                for (int64_t j = 1; j < 4; j += 2) {
+                    b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16);
+                }
+
+                if ((threadid & 16) != 0) {
+                    b32 temp = b_frag_all[k][0];
+                    b_frag_all[k][0] = b_frag_all[k][1];
+                    b_frag_all[k][1] = temp;
+
+                    temp = b_frag_all[k][2];
+                    b_frag_all[k][2] = b_frag_all[k][3];
+                    b_frag_all[k][3] = temp;
+                }
+
+                // if only going up to 256 size, store directly back to global memory,
+                // otherwise store back to shared memory for next iteration
+                b32* store = (log_had_size <= 8) ? out_chunk_ptr : b_frag_ptr;
+
+                #pragma unroll
+                for (int64_t j = 0; j < 4; j++) {
+                    int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2));
+                    int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16);
+                    int64_t real_row = real_thread_id % 4;
+                    int64_t real_col = real_thread_id / 4;
+                    store[(real_row + (reg % 2) * 4) + (real_col + (reg / 2) * 8) * 8] = b_frag_all[k][j];
+                }
+            } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings
+                if (log_had_size < 12) {
+                    // inverse of coalesced load for sizes 512, 1k and 2k to store result
+                    constexpr int xor_val = log_had_size == 9 ? 16 : 1;
+                    #pragma unroll
+                    for (int64_t j = 2; j < 4; j++) {
+                        b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val);
+                    }
+
+                    if ((threadid & xor_val) != 0) {
+                        b32 temp = b_frag_all[k][0];
+                        b_frag_all[k][0] = b_frag_all[k][2];
+                        b_frag_all[k][2] = temp;
+
+                        temp = b_frag_all[k][1];
+                        b_frag_all[k][1] = b_frag_all[k][3];
+                        b_frag_all[k][3] = temp;
+                    }
+
+                    b32* store = (b32*)(out + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 256 + (256 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block) + k));
+                    #pragma unroll
+                    for (int64_t j = 0; j < 4; j++) {
+                        int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4;
+                        b32 data = b_frag_all[k][j];
+                        int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val);
+                        int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2);
+                        int64_t rowidx = idx % (1 << part8_log_had_size);
+                        int64_t colidx = idx >> part8_log_had_size;
+                        store[rowidx * 128 + colidx] = data;
+                    }
+                }
+                // for size 4k and above, wait to process all chunks so a final store can be performed coalesced
+            }
+
+            a_chunk_ptr += 128; // (only affects first 256 size) move on to next chunk by skipping 256 elements in b16 (= 128 in b32)
+            out_chunk_ptr += 128;
+            if constexpr(log_had_size > 8) {
+                b_frag_ptr += (l == 0 ? 128 : (128 >> part8_log_had_size));
+            } else { // else is redundant, simplified version of if body, to help compiler warnings
+                b_frag_ptr += 128;
+            }
+        }
+        if (log_had_size <= 8)
+            break;
+    }
+
+    if constexpr(log_had_size >= 12) {
+        // for sizes 4k and above, perform final coalesced store after processing all chunks
+        #pragma unroll
+        for (int64_t j = 0; j < 4; j++) {
+            #pragma unroll
+            for (int64_t k = 1; k < num_chunks; k++) {
+                int64_t threadid_contig = threadid % num_chunks;
+                int64_t threadid_mul = threadid / num_chunks;
+                int64_t threadid2 = (threadid_contig + k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to
+                b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2);
+            }
+        }
+
+        // a + threadblock offset + warp offset
+        // can then index into all chunks owned by this warp
+        b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block));
+
+        #pragma unroll
+        for (int64_t j = 0; j < 4; j++) {
+            #pragma unroll
+            for (int64_t k = 0; k < num_chunks; k++) {
+                // here, j represents register, and k represents 8-offset/chunk
+                int64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data
+
+                // b32 data = b_frag_all[real_chunk_num][j]; // target thread data
+                b32 data;
+                #pragma unroll
+                for (int64_t i = 0; i < num_chunks; i++) {
+                    if (real_chunk_num == i) data = b_frag_all[i][j];
+                }
+                
+                int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread #
+                int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data)
+                int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads)
+                int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads
+                int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register
+                int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index
+
+                // fix idx for majorness
+                int64_t rowidx = idx % (1 << part8_log_had_size);
+                int64_t colidx = idx >> part8_log_had_size;
+
+                store[rowidx * 128 + colidx] = data;
+            }
+        }
+
+        __syncthreads();
+        store = ((b32*) out) + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 128;
+        int4* store4 = (int4*) store;
+        int4* bfrag_arr4 = (int4*) bfrag_arr;
+        // flush smem, simply linearly write to store
+        // always divisible by 128*32b, so (32*4)*32b is ok
+        #pragma unroll
+        for (int64_t warp_off = 0; warp_off < (num_chunks * warps_per_block * 128 / 4); warp_off += 32 * warps_per_block) {
+            int64_t total_off = warp_off + threadid + (blockid % warps_per_block) * 32;
+            store4[total_off] = bfrag_arr4[total_off];
+        }
+    }
+
+}
+
+// Divides `a` by `b` and rounds the quotient up (for non-negative `a` and
+// positive `b`). Usable in constant expressions; used below to size launch
+// grids.
+constexpr int64_t ceil_div(int64_t a, int64_t b) {
+    const int64_t biased_numerator = a + b - 1;
+    return biased_numerator / b;
+}
+
+// Launches hadamard_transform_kernel for one compile-time configuration.
+//
+// Template parameters (dtype, chunks_per_warp, warps_per_block, log_had_size,
+// blocks_per_sm) are forwarded to the kernel. When `check_masking` is true and
+// `num_chunks` is not a multiple of chunks_per_warp * warps_per_block, the
+// kernel is instantiated with `true` in its fifth template slot (presumably the
+// out-of-range masking flag - confirm against the kernel) and the grid is
+// rounded up; otherwise the `false` variant runs on an exact grid.
+//
+//   a_mat       input buffer handed to the kernel
+//   out         output buffer handed to the kernel
+//   num_chunks  chunk count handed to the kernel (run_fht passes numel / 256)
+//   stream      CUDA stream the launch is enqueued on
+//
+// Errors from cudaFuncSetAttribute and from the launch itself are raised via
+// C10_CUDA_CHECK / C10_CUDA_KERNEL_LAUNCH_CHECK.
+template <torch::ScalarType dtype, int64_t chunks_per_warp, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool check_masking = false>
+void __forceinline__ run_kernel(b16* a_mat, b16* out, int64_t num_chunks, cudaStream_t stream) {
+    // Dynamic shared memory request: 128 four-byte words per chunk per warp.
+    const int64_t shared_size = chunks_per_warp * warps_per_block * 128 * 4;
+    const dim3 block_size = 32 * warps_per_block;
+
+    // Single launch path shared by every kernel variant. This used to be a
+    // function-local #define that was repeated at each call site and stayed
+    // defined for the rest of the translation unit.
+    const auto launch = [&](auto kernel, int64_t num_blocks) {
+        // Requests above 48 KiB of dynamic shared memory need an explicit
+        // opt-in on the kernel before launching.
+        if (shared_size > 48 * 1024) {
+            C10_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536));
+        }
+        const dim3 grid_size = num_blocks;
+        kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
+    };
+
+    if constexpr(check_masking) {
+        if (num_chunks % (chunks_per_warp * warps_per_block) != 0) {
+            // Partial last block: round the grid up and use the `true` variant.
+            auto masked_kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, true, dtype>;
+            launch(masked_kernel, ceil_div(ceil_div(num_chunks, chunks_per_warp), warps_per_block));
+            C10_CUDA_KERNEL_LAUNCH_CHECK();
+            return;
+        }
+    }
+
+    // Chunk count fills every block exactly (or masking was not requested).
+    auto unmasked_kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>;
+    launch(unmasked_kernel, num_chunks / chunks_per_warp / warps_per_block);
+
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <torch::ScalarType dtype>  // dtype is forwarded to run_kernel; this file instantiates Half and BFloat16
+void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream) {  // Host dispatcher: maps the runtime had_size onto the matching compile-time run_kernel instantiation.
+    int64_t num_chunks = numel / 256; // one chunk is 256 elements; caller required to ensure numel is divisible by 256
+    // tuning notes, presumably (chunks_per_warp, warps_per_block) - confirm: for size 256, use (2, 1)
+    // for size 32k use (8, 16)
+    constexpr int64_t chunks_per_warp_small = 1;// 8;
+    constexpr int64_t warps_per_block_small = 1;//2;//16;
+    constexpr int64_t blocks_per_sm_small = 24;
+    constexpr int64_t chunks_per_warp_large = 2;
+    constexpr int64_t warps_per_block_large = 1;
+    constexpr int64_t blocks_per_sm_large = 24;
+
+    b16* a_mat = (b16*) a_mat_ptr;  // reinterpret the untyped buffers as 16-bit words for the kernel
+    b16* out = (b16*) out_ptr;
+
+    if (numel <= 256) {  // at most one chunk: "small" tiling, check_masking left at its default (false)
+        switch (had_size) {  // no default: a had_size without a case launches nothing
+            case (1<<1): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 1, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<2): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 2, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<3): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 3, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<4): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 4, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<5): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 5, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<6): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 6, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<7): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 7, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+            case (1<<8): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 8, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
+        }
+    } else {  // more than one chunk: sizes up to 256 use the "large" tiling with check_masking = true; sizes 512..32768 read their tiling from launch_configs_big (declared outside this function) with check_masking defaulted to false
+        switch (had_size) {  // no default: a had_size without a case launches nothing
+            case (1<<1):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 1, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<2):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 2, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<3):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 3, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<4):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 4, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<5):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 5, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<6):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 6, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<7):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 7, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<8):  run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 8, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
+            case (1<<9):  run_kernel<dtype, launch_configs_big[0][0], launch_configs_big[0][1], 9 , launch_configs_big[0][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<10): run_kernel<dtype, launch_configs_big[1][0], launch_configs_big[1][1], 10, launch_configs_big[1][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<11): run_kernel<dtype, launch_configs_big[2][0], launch_configs_big[2][1], 11, launch_configs_big[2][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<12): run_kernel<dtype, launch_configs_big[3][0], launch_configs_big[3][1], 12, launch_configs_big[3][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<13): run_kernel<dtype, launch_configs_big[4][0], launch_configs_big[4][1], 13, launch_configs_big[4][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<14): run_kernel<dtype, launch_configs_big[5][0], launch_configs_big[5][1], 14, launch_configs_big[5][2]>(a_mat, out, num_chunks, stream); break;
+            case (1<<15): run_kernel<dtype, launch_configs_big[6][0], launch_configs_big[6][1], 15, launch_configs_big[6][2]>(a_mat, out, num_chunks, stream); break;
+        }
+    }
+}
+
+template void run_fht<torch::ScalarType::Half>(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream);  // explicit instantiation for fp16
+template void run_fht<torch::ScalarType::BFloat16>(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream);  // explicit instantiation for bf16
+
+}  // namespace hadacore
+
+// True when `x` is non-zero and has exactly one bit set (clearing the lowest
+// set bit via x & (x - 1) leaves nothing behind).
+constexpr bool is_power_of_two(int x) {
+    return x != 0 && (x & (x - 1)) == 0;
+}
+
+// Applies the hadacore Hadamard transform along the last dimension of `x`.
+//
+//   x        CUDA tensor, fp16 or bf16, whose last dimension is a power of two
+//            no larger than 2^15. It is taken by non-const reference and is
+//            rebound inside this function (to the 2-D working view, or back to
+//            the original tensor when results are copied back).
+//   inplace  true:  the result is written into the storage of the tensor that
+//                   was passed in, and a tensor of the original shape is
+//                   returned.
+//            false: the result is written to a freshly allocated tensor and
+//                   the values of the passed-in tensor are left untouched.
+//
+// Returns the transformed tensor with the original shape of `x`.
+// Raises (TORCH_CHECK) on unsupported dtype, non-CUDA input or unsupported
+// last-dimension size.
+torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
+    auto dtype = x.scalar_type();
+    TORCH_CHECK(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
+    TORCH_CHECK(x.is_cuda());
+
+    const int had_size = x.size(-1);
+    TORCH_CHECK(is_power_of_two(had_size) && (had_size <= (1U << 15)),
+        "Only power of two Hadamard sizes up to 2^15 are supported, got ", had_size);
+
+    // A size-1 transform is the identity and run_fht has no case for it, so
+    // answer directly instead of returning a buffer no kernel ever wrote.
+    if (had_size == 1) {
+        return inplace ? x : x.clone();
+    }
+
+    // Keep a handle on the caller's tensor and an owning copy of its shape:
+    // `x` is rebound below, which would leave an IntArrayRef into it dangling.
+    torch::Tensor input = x;
+    const auto res_shape = x.sizes().vec();
+    x = x.reshape({-1, had_size});
+
+    // The kernel works on whole 256-element chunks; append zero rows if needed.
+    const auto numel = x.numel();
+    const bool needs_padding = numel % 256 != 0;
+    if (needs_padding) {
+        x = torch::nn::functional::pad(x, torch::nn::functional::PadFuncOptions({0, 0, 0, (256 - numel % 256) / had_size}));
+    }
+
+    // The kernel addresses the buffer as one flat array, so rows must be dense
+    // as well, not just unit-stride in the last dimension. contiguous() is a
+    // no-op when the layout is already dense.
+    x = x.contiguous();
+    torch::Tensor out = inplace ? x : torch::empty_like(x);
+
+    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+    VLLM_DISPATCH_HALF_TYPES(x.scalar_type(), "hadacore_transform_runfht", [&] {
+      auto constexpr SCALAR_TYPE = c10::CppTypeToScalarType<scalar_t>::value;
+      // Write to `out`: it aliases `x` when inplace, and is the fresh result
+      // buffer otherwise. (Passing x for both left `out` uninitialised and
+      // overwrote the input in the out-of-place case.)
+      hadacore::run_fht<SCALAR_TYPE>(x.data_ptr(), out.data_ptr(), x.numel(), had_size, stream);
+    });
+
+    // Drop the rows that were only added as padding.
+    if (needs_padding) {
+        out = out.index({torch::indexing::Slice(0, numel / had_size)});
+    }
+
+    // In-place request, but padding / densifying / reshape forced the kernel to
+    // run on a copy: move the result back into the caller's storage.
+    if (inplace && out.data_ptr() != input.data_ptr()) {
+        input.copy_(out.reshape(res_shape));
+        x = input;
+        return x;
+    }
+    return out.reshape(res_shape);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {  // registers implementations under the CUDA dispatch key of the TORCH_EXTENSION_NAME library
+    m.impl("hadacore_transform", &hadacore_transform);  // binds the op name "hadacore_transform" to the function defined above
+}
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -30,6 +30,10 @@
  #define __HIP__GFX9__
 #endif

+#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
+  #define __HIP__FP8MFMA__
+#endif
+
 #if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
  #define __HIP__GFX11__
 #endif
@ -51,6 +55,12 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

+enum class MFMAType {  // Compile-time selector passed to the attention kernels and launchers as the MFMA_TYPE template argument.
+  F16 = 0,  // kernels take the gcn_mfma16x16x16_instr<scalar_t, ...> path
+  Fp8 = 1,  // kernels quantize Q to fp8 and use gcn_mfma16x16x32_instr (only compiled when __HIP__FP8MFMA__ is defined)
+  Fp4 = 2,  // NOTE(review): not referenced explicitly in the visible hunks; presumably reserved for a 4-bit path - confirm
+};
+
 #if defined(__HIP__GFX9__)

  #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32
@ -112,6 +122,21 @@ __device__ __forceinline__ floatx4 gcn_mfma16x16x16_instr(const _B16x4& inpA,
  }
 }

+// Dispatches to the 16x16x32 MFMA builtin that matches the 8-bit float type T:
+//   __hip_fp8_e4m3 -> __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8
+//   __hip_fp8_e5m2 -> __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8
+// Any other T is rejected at compile time. inpA / inpB are the two operands
+// (passed as `long`), inpC is the float accumulator; absz, cbid and blgp are
+// forwarded unchanged as the builtin's trailing arguments.
+template <typename T, int absz, int cbid, int blgp>
+__device__ __forceinline__ floatx4 gcn_mfma16x16x32_instr(const long& inpA,
+                                                          const long& inpB,
+                                                          const floatx4& inpC) {
+  constexpr bool kIsE4M3 = std::is_same<T, __hip_fp8_e4m3>::value;
+  constexpr bool kIsE5M2 = std::is_same<T, __hip_fp8_e5m2>::value;
+
+  if constexpr (kIsE4M3) {
+    return __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(inpA, inpB, inpC, absz,
+                                                      cbid, blgp);
+  } else if constexpr (kIsE5M2) {
+    return __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(inpA, inpB, inpC, absz,
+                                                      cbid, blgp);
+  } else {
+    static_assert(false, "unsupported 8b dtype");
+  }
+}
+
 template <typename T>
 __device__ __forceinline__ float to_float(const T& inp) {
  if constexpr (std::is_same<T, _Float16>::value) {
@ -256,12 +281,44 @@ __device__ __forceinline__ _B16x8 convert_b8x8_custom(const _B8x8 input) {
  return ret;
 }

+typedef union u64_cvt {  // Overlays several views of the same storage so a value written through one member can be read through another.
+  half f16x4[4];  // view as four half-precision floats
+  int16_t b16x4[4];  // view as four raw 16-bit words; the fp8 conversion code stores one __builtin_amdgcn_cvt_pk_fp8_f32 result per word
+  _B8x8 b8x8;  // presumably eight 8-bit elements (type declared elsewhere in this file) - confirm
+  _B16x4 b64;  // operand form handed to gcn_mfma16x16x16_instr in convert_b16x8
+  int64_t i64;  // single 64-bit integer; the form handed to gcn_mfma16x16x32_instr
+} _T8x8;
+
+__device__ __forceinline__ _B8x8 convert_b16x8(const _B16x8& input,  // Runs each half of `input` through a 16x16x16 MFMA with Mtemp, converts the float results to fp8 and returns them as a _B8x8.
+                                               _T8x8& Mtemp) {  // Mtemp.b64 is the first MFMA operand; NOTE(review): presumably an identity or scaling tile - confirm with callers
+  _T8x8 Qtmp8x8;  // output staging; all four 16-bit words are written below
+
+  for (int i = 0; i < 2; i++) {  // one pass per element of input.xy
+    floatx4 q_out = {0, 0, 0, 0};  // zero accumulator, so q_out ends up holding only the product
+    q_out = gcn_mfma16x16x16_instr<_Float16, 0, 0, 0>(Mtemp.b64, input.xy[i],
+                                                      q_out);
+    Qtmp8x8.b16x4[i * 2] =  // floats 0 and 1 of this pass -> word 2*i
+        __builtin_amdgcn_cvt_pk_fp8_f32(q_out[0], q_out[1], 0, false);
+    Qtmp8x8.b16x4[i * 2 + 1] =  // floats 2 and 3 of this pass -> word 2*i + 1
+        __builtin_amdgcn_cvt_pk_fp8_f32(q_out[2], q_out[3], 0, false);
+  }
+  return Qtmp8x8.b8x8;  // same storage, read back through the b8x8 member
+}
+
+// Returns the maximum of `val` over the lanes of the calling wavefront, and
+// returns it on EVERY lane.
+//
+// The exchange is an XOR butterfly: at each step a lane combines its value with
+// the lane whose index differs in one bit, so after the last step all lanes
+// hold the same maximum. A __shfl_down chain would leave the full result only
+// in lane 0, while the attention kernel in this file derives q_scale from the
+// returned value on all lanes and therefore needs them to agree.
+__device__ float warpReduceMax(float val) {
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val = max(val, __shfl_xor(val, offset, WARP_SIZE));
+  }
+  return val;
+}
+
 // grid (num_seqs, num_partitions,num_kv_heads)
 // block (256)
 // clang-format off
 template <typename scalar_t, typename cache_t,
          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
-          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, MFMAType MFMA_TYPE>
 __global__
 __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
@ -367,6 +424,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;

  int kphysical_block_number[TLOOP];
+  #if defined(__HIP__FP8MFMA__)
+  float q_max = 0;
+  float q_scale = 1.0;
+  #endif

  // fetch k physical block numbers
  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
@ -416,6 +477,15 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
        Qlocal[qkhe_depth][qkratio].xy[i] =
            shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO]
                         [2 * qkratio + i];
+  #if defined(__HIP__FP8MFMA__)
+        if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto &&
+                      MFMA_TYPE == MFMAType::Fp8) {
+          scalar_t* qptr =
+              reinterpret_cast<scalar_t*>(&Qlocal[qkhe_depth][qkratio].xy[i]);
+          for (int k = 0; k < 4; k++)
+            q_max = fmax(fabs(to_float<scalar_t>(qptr[k])), q_max);
+        }
+  #endif
      }
    }
  }
@ -515,6 +585,14 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto) {
    // multiply by k_scale if fp8 kv cache
    scale2 *= *k_scale;
+  #if defined(__HIP__FP8MFMA__)
+    q_max = warpReduceMax(q_max);
+    constexpr float FP8_E4M3_SCALE_TARGET = 224.0f;
+    if constexpr (MFMA_TYPE == MFMAType::Fp8) {
+      q_scale = q_max > 0 ? FP8_E4M3_SCALE_TARGET / q_max : 1.0f;
+      scale2 /= q_scale;
+    }
+  #endif
  }

  floatx4 d_out[TLOOP];
@ -534,12 +612,41 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
        auto Ktmp = Klocal[token_depth][qkhe_depth];
        _B8x16 Ktmp8x16 = *reinterpret_cast<_B8x16*>(&Ktmp);
        for (int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) {
-          _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio];
-          _B16x8 Klocaltmp = convert_b8x8_custom<scalar_t>(Ktmp8x8);
-          for (int i = 0; i < 2; i++) {
-            d_out[token_depth] = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
-                Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i],
-                d_out[token_depth]);
+          if constexpr (MFMA_TYPE == MFMAType::F16) {
+            _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio];
+            _B16x8 Klocaltmp = convert_b8x8_custom<scalar_t>(Ktmp8x8);
+            for (int i = 0; i < 2; i++) {
+              d_out[token_depth] = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
+                  Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i],
+                  d_out[token_depth]);
+            }
+          } else {
+  #if defined(__HIP__FP8MFMA__)
+            _T8x8 Ktmp8x8, Qtmp8x8;
+            Ktmp8x8.b8x8 = Ktmp8x16.xy[qkratio];
+
+            for (int n = 0; n < 2; n++) {
+              scalar_t* qptr = reinterpret_cast<scalar_t*>(
+                  &Qlocal[qkhe_depth][qkratio].xy[n]);
+
+              Qtmp8x8.b16x4[n * 2] =
+                  vllm::fp8::scaled_vec_conversion<uint16_t, float2>(
+                      make_float2(to_float<scalar_t>(qptr[0]),
+                                  to_float<scalar_t>(qptr[1])),
+                      q_scale);
+              Qtmp8x8.b16x4[n * 2 + 1] =
+                  vllm::fp8::scaled_vec_conversion<uint16_t, float2>(
+                      make_float2(to_float<scalar_t>(qptr[2]),
+                                  to_float<scalar_t>(qptr[3])),
+                      q_scale);
+            }
+
+            d_out[token_depth] =
+                gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>(
+                    Ktmp8x8.i64, Qtmp8x8.i64, d_out[token_depth]);
+  #else
+            UNREACHABLE_CODE
+  #endif
          }
        }
      }
@ -629,17 +736,36 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  // disable rtz conversion due to its impact on accuracy.
  constexpr bool LOGITS_RTZ_CONVERSION = false;

+  #if defined(__HIP__FP8MFMA__)
+  int rowid_8x8 = rowid / 2;
+  int offset = rowid % 2;
+  #endif
+
  // write logits to shared mem
  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
    d_out[token_depth] *= inv_sum_scale;
-    if constexpr (LOGITS_RTZ_CONVERSION) {
-      // use rtz conversion for better performance, with negligible impact on
-      // accuracy
-      shared_logits[warpid][token_depth][lane16id][rowid] =
-          from_floatx4_rtz<scalar_t>(d_out[token_depth]);
+    if constexpr (MFMA_TYPE != MFMAType::Fp8) {
+      if constexpr (LOGITS_RTZ_CONVERSION) {
+        // use rtz conversion for better performance, with negligible impact on
+        // accuracy
+        shared_logits[warpid][token_depth][lane16id][rowid] =
+            from_floatx4_rtz<scalar_t>(d_out[token_depth]);
+      } else {
+        shared_logits[warpid][token_depth][lane16id][rowid] =
+            from_floatx4<scalar_t>(d_out[token_depth]);
+      }
    } else {
-      shared_logits[warpid][token_depth][lane16id][rowid] =
-          from_floatx4<scalar_t>(d_out[token_depth]);
+  #if defined(__HIP__FP8MFMA__)
+      // cast _B16x4* to _B8x8*
+      _T8x8& logits_8x8 = *reinterpret_cast<_T8x8*>(
+          &shared_logits[warpid][token_depth][lane16id][rowid_8x8]);
+      logits_8x8.b16x4[offset * 2] = __builtin_amdgcn_cvt_pk_fp8_f32(
+          d_out[token_depth][0], d_out[token_depth][1], 0, false);
+      logits_8x8.b16x4[offset * 2 + 1] = __builtin_amdgcn_cvt_pk_fp8_f32(
+          d_out[token_depth][2], d_out[token_depth][3], 0, false);
+  #else
+      UNREACHABLE_CODE
+  #endif
    }
  }

@ -692,19 +818,42 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
          _B8x16 Vtmp8x16 = *reinterpret_cast<_B8x16*>(&Vtmp);
          for (int j = 0; j < ELEMS16_ELEMS8_RATIO; j++) {
            _B8x8 Vtmp8x8 = Vtmp8x16.xy[j];
-            _B16x8 Vlocaltmp = convert_b8x8_custom<scalar_t>(Vtmp8x8);
-            for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) {
-              const int offset =
-                  rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO +
-                  j * ELEMS8_ELEMS4_RATIO + i;
-              const int offset1 = offset % ROWS_PER_WARP;
-              const int offset2 = offset / ROWS_PER_WARP;
-              // output format is 16 qheads across 16 lanes, 16 head elems
-              // spread across 4 rows
-              tmp_out = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
-                  Vlocaltmp.xy[i],
-                  shared_logits[vtoken_depth][offset2][lane16id][offset1],
-                  tmp_out);
+            if constexpr (MFMA_TYPE == MFMAType::F16) {
+              _B16x8 Vlocaltmp = convert_b8x8_custom<scalar_t>(Vtmp8x8);
+              for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) {
+                const int offset =
+                    rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO +
+                    j * ELEMS8_ELEMS4_RATIO + i;
+                const int offset1 = offset % ROWS_PER_WARP;
+                const int offset2 = offset / ROWS_PER_WARP;
+                // output format is 16 qheads across 16 lanes, 16 head elems
+                // spread across 4 rows
+                tmp_out = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
+                    Vlocaltmp.xy[i],
+                    shared_logits[vtoken_depth][offset2][lane16id][offset1],
+                    tmp_out);
+              }
+            } else {
+  #if defined(__HIP__FP8MFMA__)
+              for (int i = 0; i < ELEMS8_ELEMS4_RATIO / 2; i++) {
+                const int offset =
+                    rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO +
+                    j * ELEMS8_ELEMS4_RATIO + i;
+                const int offset1 = (offset % ROWS_PER_WARP) / 2;
+                const int offset2 = offset / ROWS_PER_WARP;
+                // output format is 16 qheads across 16 lanes, 16 head elems
+                // spread across 4 rows
+                tmp_out = gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>(
+                    reinterpret_cast<_T8x8*>(&Vtmp8x8)->i64,
+                    reinterpret_cast<_T8x8*>(
+                        &shared_logits[vtoken_depth][offset2][lane16id]
+                                      [offset1])
+                        ->i64,
+                    tmp_out);
+              }
+  #else
+              UNREACHABLE_CODE
+  #endif
            }
          }
        }
@ -1570,7 +1719,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
 // clang-format off
 template <typename scalar_t, typename cache_t,
          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
-          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO,
+          MFMAType MFMA_TYPE>
 __global__
 __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
@ -2337,7 +2487,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
 // clang-format off
 template <typename scalar_t, typename cache_t,
          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
-          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO,
+          MFMAType MFMA_TYPE>
 __global__
 __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
@ -2969,7 +3120,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 template <typename scalar_t, typename cache_t,
          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
-          int GQA_RATIO>
+          int GQA_RATIO, MFMAType MFMA_TYPE>
 __global__
 __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
@ -3041,7 +3192,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 #define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                              \
  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
-                                          GQA_RATIO>                           \
+                                          GQA_RATIO, MFMA_TYPE>                \
      <<<grid, block, 0, stream>>>(                                            \
          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
          block_tables_ptr, seq_lens_ptr, query_start_loc_ptr,                 \
@ -3069,7 +3220,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

 template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
-          bool ALIBI_ENABLED>
+          bool ALIBI_ENABLED, MFMAType MFMA_TYPE>
 void paged_attention_custom_launcher(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
@ -3225,7 +3376,7 @@ void paged_attention_custom_launcher(

 template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
-          bool ALIBI_ENABLED>
+          bool ALIBI_ENABLED, MFMAType MFMA_TYPE>
 void paged_attention_custom_launcher_navi(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
@ -3397,74 +3548,77 @@ void paged_attention_custom_launcher_navi(
 }

 #define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT,   \
-                             PSIZE, ALIBI_ENABLED)                          \
+                             PSIZE, ALIBI_ENABLED, MFMA_TYPE)               \
  if (!is_navi) {                                                           \
    paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,  \
-                                    OUTT, PSIZE, ALIBI_ENABLED>(            \
+                                    OUTT, PSIZE, ALIBI_ENABLED, MFMA_TYPE>( \
        out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,  \
        num_kv_heads, scale, block_tables, seq_lens, query_start_loc,       \
        max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale);        \
  } else {                                                                  \
-    paged_attention_custom_launcher_navi<                                   \
-        T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \
+    paged_attention_custom_launcher_navi<T, KVT, KV_DTYPE, BLK_SIZE,        \
+                                         HEAD_SIZE, OUTT, PSIZE,            \
+                                         ALIBI_ENABLED, MFMA_TYPE>(         \
        out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,  \
        num_kv_heads, scale, block_tables, seq_lens, query_start_loc,       \
        max_seq_len, alibi_slopes, k_scale, v_scale);                       \
  }

 #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,    \
-                                   OUTT, PSIZE)                              \
+                                   OUTT, PSIZE, MFMA_TYPE)                   \
  if (alibi_slopes) {                                                        \
    CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \
-                         true);                                              \
+                         true, MFMA_TYPE);                                   \
  } else {                                                                   \
    CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \
-                         false);                                             \
+                         false, MFMA_TYPE);                                  \
  }

 #if defined(__HIPCC__) && defined(__gfx90a__)
-  #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE)  \
+  #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,  \
+                                   MFMA_TYPE)                              \
    if (fp8_out_scale) {                                                   \
      TORCH_CHECK(false, "fp8 out scale unsupported for gfx90a");          \
    } else {                                                               \
      CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
-                                 256);                                     \
+                                 256, MFMA_TYPE);                          \
    }
 #else
-  #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE)  \
+  #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,  \
+                                   MFMA_TYPE)                              \
    if (fp8_out_scale) {                                                   \
      CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,    \
-                                 uint8_t, 256);                            \
+                                 uint8_t, 256, MFMA_TYPE);                 \
    } else {                                                               \
      CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
-                                 256);                                     \
+                                 256, MFMA_TYPE);                          \
    }
 #endif

-#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE)     \
-  switch (block_size) {                                           \
-    case 16:                                                      \
-      CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE);  \
-      break;                                                      \
-    case 32:                                                      \
-      CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE);  \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
+#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE, MFMA_TYPE)    \
+  switch (block_size) {                                                     \
+    case 16:                                                                \
+      CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE, MFMA_TYPE); \
+      break;                                                                \
+    case 32:                                                                \
+      CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE, MFMA_TYPE); \
+      break;                                                                \
+    default:                                                                \
+      TORCH_CHECK(false, "Unsupported block size: ", block_size);           \
+      break;                                                                \
  }

-#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE)         \
-  switch (head_size) {                                          \
-    case 64:                                                    \
-      CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64);           \
-      break;                                                    \
-    case 128:                                                   \
-      CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128);          \
-      break;                                                    \
-    default:                                                    \
-      TORCH_CHECK(false, "Unsupported head size: ", head_size); \
-      break;                                                    \
+#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE, MFMA_TYPE) \
+  switch (head_size) {                                             \
+    case 64:                                                       \
+      CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64, MFMA_TYPE);   \
+      break;                                                       \
+    case 128:                                                      \
+      CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128, MFMA_TYPE);  \
+      break;                                                       \
+    default:                                                       \
+      TORCH_CHECK(false, "Unsupported head size: ", head_size);    \
+      break;                                                       \
  }

 bool is_navi_gpu() {
@ -3503,28 +3657,43 @@ void paged_attention(
    const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale,
-    const std::optional<torch::Tensor>& fp8_out_scale) {
+    const std::optional<torch::Tensor>& fp8_out_scale,
+    const std::string& mfma_type) {
  // clang-format on
  bool is_navi = is_navi_gpu();
-
  const int head_size = query.size(2);
  if (kv_cache_dtype == "auto") {
    if (query.dtype() == at::ScalarType::Half) {
-      CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, _Float16,
-                                    vllm::Fp8KVCacheDataType::kAuto);
+      CALL_CUSTOM_LAUNCHER_BLK_HEAD(
+          _Float16, _Float16, vllm::Fp8KVCacheDataType::kAuto, MFMAType::F16);
    } else if (query.dtype() == at::ScalarType::BFloat16) {
      CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, __hip_bfloat16,
-                                    vllm::Fp8KVCacheDataType::kAuto);
+                                    vllm::Fp8KVCacheDataType::kAuto,
+                                    MFMAType::F16);
    } else {
      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
    }
  } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") {
    if (query.dtype() == at::ScalarType::Half) {
-      CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t,
-                                    vllm::Fp8KVCacheDataType::kFp8E4M3);
+      if (mfma_type == "fp8") {
+        CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t,
+                                      vllm::Fp8KVCacheDataType::kFp8E4M3,
+                                      MFMAType::Fp8);
+      } else {
+        CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t,
+                                      vllm::Fp8KVCacheDataType::kFp8E4M3,
+                                      MFMAType::F16);
+      }
    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t,
-                                    vllm::Fp8KVCacheDataType::kFp8E4M3);
+      if (mfma_type == "fp8") {
+        CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t,
+                                      vllm::Fp8KVCacheDataType::kFp8E4M3,
+                                      MFMAType::Fp8);
+      } else {
+        CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t,
+                                      vllm::Fp8KVCacheDataType::kFp8E4M3,
+                                      MFMAType::F16);
+      }
    } else {
      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
    }
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@ -19,4 +19,5 @@ void paged_attention(
    const std::optional<torch::Tensor>& query_start_loc, int64_t block_size,
    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale);
+    torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale,
+    const std::string& mfma_type);
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
@ -48,7 +48,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
      "                Tensor? alibi_slopes,"
      "                str kv_cache_dtype,"
      "                Tensor k_scale, Tensor v_scale,"
-      "                Tensor? fp8_out_scale) -> ()");
+      "                Tensor? fp8_out_scale,"
+      "                str mfma_type) -> ()");
  rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
 }

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -32,6 +32,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  #define stride_tag
 #endif

+  ops.def(
+      "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
+      "y_q, Tensor! y_s, int group_size, "
+      "bool use_ue8m0, int num_parallel_tokens) -> ()");
+  ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
+           &silu_mul_fp8_quant_deep_gemm_cuda);
+
  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);

@ -168,6 +175,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);

+  // Polynomial Normalization.
+  ops.def(
+      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
+      "epsilon) -> ()");
+  ops.impl("poly_norm", torch::kCUDA, &poly_norm);
+
  // Apply repetition penalties to logits in-place
  ops.def(
      "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@ -208,16 +221,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);

-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key
-  // (supports multiple loras).
-  ops.def(
-      "batched_rotary_embedding(Tensor positions, Tensor! query,"
-      "                         Tensor!? key, int head_size,"
-      "                         Tensor cos_sin_cache, bool is_neox,"
-      "                         int rot_dim,"
-      "                         Tensor cos_sin_cache_offsets) -> ()");
-  ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding);
-
  // Quantization ops
 #ifndef USE_ROCM
  // Quantized GEMM for AWQ.
@ -507,13 +510,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]");
  ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);

-  // CUTLASS MLA decode
-  ops.def(
-      "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
-      "                   Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
-      "                   Tensor page_table, float scale) -> ()");
-  ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
-
  // SM100 CUTLASS MLA decode
  ops.def(
      "sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"
@ -610,6 +606,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "int pad_slot_id) -> ()");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);

+  // Hadamard transforms
+  ops.def("hadacore_transform(Tensor! x, bool inplace) -> Tensor");
+
 #ifndef USE_ROCM
  // Compute per-token-group FP8 quantized tensor and scaling factor.
  ops.def(
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0

 # Flag to control whether to use pre-built vLLM wheels
 ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_MAIN_CUDA_VERSION=""

 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
@ -281,6 +283,10 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM

+ARG GDRCOPY_CUDA_VERSION=12.8
+# Keep in line with FINAL_BASE_IMAGE
+ARG GDRCOPY_OS_VERSION=Ubuntu22_04
+
 SHELL ["/bin/bash", "-c"]

 ARG DEADSNAKES_MIRROR_URL
@ -375,7 +381,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.3.0"
+ARG FLASHINFER_GIT_REF="v0.3.1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
@ -439,13 +445,21 @@ COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} 

-# Install EP kernels(pplx-kernels and DeepEP), NixL
+COPY tools/install_gdrcopy.sh install_gdrcopy.sh
+RUN set -eux; \
+    case "${TARGETPLATFORM}" in \
+      linux/arm64) UUARCH="aarch64" ;; \
+      linux/amd64) UUARCH="x64" ;; \
+      *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+    esac; \
+    ./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
+    rm ./install_gdrcopy.sh
+
+# Install EP kernels(pplx-kernels and DeepEP)
 COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
-COPY tools/install_nixl.sh install_nixl.sh
 ENV CUDA_HOME=/usr/local/cuda
 RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
-    && bash install_python_libraries.sh \
-    && bash install_nixl.sh --force
+    && bash install_python_libraries.sh

 #################### vLLM installation IMAGE ####################

@ -519,7 +533,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    else \
        BITSANDBYTES_VERSION="0.46.1"; \
    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
+    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.2.2.post1
+# release version: v0.3.1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    echo "git clone flashinfer..." \
    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
-    && git checkout v0.2.2.post1 \
+    && git checkout v0.3.1 \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
    && rm -rf build \
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@ -47,6 +47,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite

 # -----------------------
@ -71,7 +72,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
    && rm -rf vllm \
    && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
+    && python3 -m pip install lm-eval[api]==0.4.4 \
    && python3 -m pip install pytest-shard

 # -----------------------
@ -100,8 +101,10 @@ ARG COMMON_WORKDIR
 # Copy over the benchmark scripts as well
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
+COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker

 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

 # ENV that can improve safe tensor loading, and end-to-end time
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,18 +1,16 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
-ARG HIPBLAS_COMMON_BRANCH="7c1566b"
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
+ARG HIPBLASLT_BRANCH="aa0bda7b"
+ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
 ARG LEGACY_HIPBLASLT_OPTION=
-ARG RCCL_BRANCH="648a58d"
-ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
+ARG PYTORCH_BRANCH="f717b2af"
 ARG PYTORCH_VISION_BRANCH="v0.21.0"
-ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="916bf3c"
+ARG AITER_BRANCH="4822e675"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@ -45,7 +43,7 @@ RUN apt-get update -y \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@ -53,6 +51,7 @@ ARG HIPBLAS_COMMON_BRANCH
 # Set to "--legacy_hipblas_direct" for ROCm<=6.2
 ARG LEGACY_HIPBLASLT_OPTION
 RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
 RUN cd hipBLAS-common \
    && git checkout ${HIPBLAS_COMMON_BRANCH} \
    && mkdir build \
@ -69,24 +68,17 @@ RUN cd hipBLASLt \
    && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install

-FROM base AS build_rccl
-ARG RCCL_BRANCH
-ARG RCCL_REPO
-RUN git clone ${RCCL_REPO}
-RUN cd rccl \
-    && git checkout ${RCCL_BRANCH} \
-    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
-RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
-
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 RUN git clone ${TRITON_REPO}
 RUN cd triton \
    && git checkout ${TRITON_BRANCH} \
-    && cd python \
-    && python3 setup.py bdist_wheel --dist-dir=dist
-RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
+    && if [ ! -f setup.py ]; then cd python; fi \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && mkdir -p /app/install && cp dist/*.whl /app/install
+RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
+    && python3 -m build --wheel && cp dist/*.whl /app/install; fi

 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
@ -132,15 +124,25 @@ RUN cd aiter \
 RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install

+FROM base AS debs
+RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    cp /install/*.deb /app/debs
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
-    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
-    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
-RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
-    dpkg -i /install/*deb \
-    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
-    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
+    && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@ -154,8 +156,6 @@ ARG BASE_IMAGE
 ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
-ARG RCCL_BRANCH
-ARG RCCL_REPO
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 ARG PYTORCH_BRANCH
@ -170,8 +170,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
-    && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
-    && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@ -180,4 +178,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@ -44,11 +44,12 @@ nav:
      - contributing/model/registration.md
      - contributing/model/tests.md
      - contributing/model/multimodal.md
+      - contributing/model/transcription.md
    - CI: contributing/ci
    - Design Documents: design
  - API Reference:
    - api/README.md
-    - api/vllm/*
+    - api/vllm
  - CLI Reference: cli
  - Community:
    - community/*
--- a/docs/README.md
+++ b/docs/README.md
@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@ -230,6 +230,20 @@ Multi-modal IPC caching is automatically enabled when
 there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
 to avoid repeatedly transferring the same multi-modal inputs between them.

+#### Key-Replicated Cache
+
+By default, IPC caching uses a **key-replicated cache**, where cache keys exist
+in both the API (`P0`) and engine core (`P1`) processes, but the actual cache
+data resides only in `P1`.
+
+#### Shared Memory Cache
+
+When multiple worker processes are involved (e.g., when TP > 1), a
+**shared-memory cache** is more efficient. This can be enabled by setting
+`mm_processor_cache_type="shm"`. In this mode, cache keys are stored
+on `P0`, while the cache data itself lives in shared memory accessible by all
+processes.
+
 ### Configuration

 You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB).
@ -244,6 +258,12 @@ Examples:
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          mm_processor_cache_gb=8)

+# Use a shared-memory based IPC cache
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          tensor_parallel_size=2,
+          mm_processor_cache_type="shm",
+          mm_processor_cache_gb=8)
+
 # Disable the cache
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          mm_processor_cache_gb=0)
@ -253,11 +273,12 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",

 Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:

-| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. Memory |
-|-------------------|-------------|------------|------------|-------------|
-| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` |
-| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
-| ❌ | ❌ | N/A | N/A | `0` |
+| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
+|-------------------|-------------|------------|------------|-------------|-------------|
+| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * api_server_count` |
+| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * data_parallel_size` |
+| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
+| N/A | Disabled | N/A | N/A | N/A | `0` |

 K: Stores the hashes of multi-modal items  
 V: Stores the processed tensor data of multi-modal items
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@ -1,9 +1,790 @@
+---
+toc_depth: 4
+---
+
 # Benchmark Suites

-vLLM contains two sets of benchmarks:
+vLLM provides comprehensive benchmarking tools for performance testing and evaluation:

- [Performance benchmarks][performance-benchmarks]
- [Nightly benchmarks][nightly-benchmarks]
+- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
+- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
+- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
+
+[Benchmark CLI]: #benchmark-cli
+
+## Benchmark CLI
+
+This section guides you through running benchmark tests with the extensive
+datasets supported on vLLM. It's a living document, updated as new features and datasets
+become available.
+
+### Dataset Overview
+
+<style>
+th {
+  min-width: 0 !important;
+}
+</style>
+
+| Dataset | Online | Offline | Data Path |
+|---------|--------|---------|-----------|
+| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
+| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
+| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
+| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
+| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
+| Random | ✅ | ✅ | `synthetic` |
+| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
+| Prefix Repetition | ✅ | ✅ | `synthetic` |
+| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
+| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
+| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
+| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
+| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
+| HuggingFace-MTBench | ✅ | ✅ | `philschmid/mt-bench` |
+| HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
+| Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
+| Custom | ✅ | ✅ | Local file: `data.jsonl` |
+
+Legend:
+
+- ✅ - supported
+- 🟡 - partially supported
+- 🚧 - to be supported
+
+!!! note
+    HuggingFace dataset's `dataset-name` should be set to `hf`.
+    For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+
+    ```bash
+    --dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+    ```
+
+### Examples
+
+#### 🚀 Online Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+First start serving your model
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
+```
+
+Then run the benchmarking script
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --num-prompts 10
+```
+
+If successful, you will see the following output
+
+```text
+============ Serving Benchmark Result ============
+Successful requests:                     10
+Benchmark duration (s):                  5.78
+Total input tokens:                      1369
+Total generated tokens:                  2212
+Request throughput (req/s):              1.73
+Output token throughput (tok/s):         382.89
+Total Token throughput (tok/s):          619.85
+---------------Time to First Token----------------
+Mean TTFT (ms):                          71.54
+Median TTFT (ms):                        73.88
+P99 TTFT (ms):                           79.49
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          7.91
+Median TPOT (ms):                        7.96
+P99 TPOT (ms):                           8.03
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           7.74
+Median ITL (ms):                         7.70
+P99 ITL (ms):                            8.39
+==================================================
+```
+
+##### Custom Dataset
+
+If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format, and each entry needs a `"prompt"` field, e.g., `data.jsonl`:
+
+```json
+{"prompt": "What is the capital of India?"}
+{"prompt": "What is the capital of Iran?"}
+{"prompt": "What is the capital of China?"}
+```
+
+```bash
+# start server
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --port 9001
+```
+
+```bash
+# run benchmarking script
+vllm bench serve --port 9001 --save-result --save-detailed \
+  --backend vllm \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --endpoint /v1/completions \
+  --dataset-name custom \
+  --dataset-path <path-to-your-data-jsonl> \
+  --custom-skip-chat-template \
+  --num-prompts 80 \
+  --max-concurrency 1 \
+  --temperature=0.3 \
+  --top-p=0.75 \
+  --result-dir "./log/"
+```
+
+You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
+
+##### VisionArena Benchmark for Vision Language Models
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --endpoint-type openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat \
+  --hf-split train \
+  --num-prompts 1000
+```
+
+##### InstructCoder Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name hf \
+    --dataset-path likaixin/InstructCoder \
+    --num-prompts 2048
+```
+
+##### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1 \
+    --spec-bench-category "summarization"
+```
+
+##### Other HuggingFaceDataset Examples
+
+```bash
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+`lmms-lab/LLaVA-OneVision-Data`:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --endpoint-type openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path lmms-lab/LLaVA-OneVision-Data \
+  --hf-split train \
+  --hf-subset "chart2text(cauldron)" \
+  --num-prompts 10
+```
+
+`Aeala/ShareGPT_Vicuna_unfiltered`:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --endpoint-type openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+  --hf-split train \
+  --num-prompts 10
+```
+
+`AI-MO/aimo-validation-aime`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path AI-MO/aimo-validation-aime \
+    --num-prompts 10 \
+    --seed 42
+```
+
+`philschmid/mt-bench`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 80
+```
+
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path vdaita/edit_5k_char \
+    --num-prompts 90 \
+    --blazedit-min-distance 0.01 \
+    --blazedit-max-distance 0.99
+```
+
+##### Running With Sampling Parameters
+
+When using OpenAI-compatible backends such as `vllm`, optional sampling
+parameters can be specified. Example client command:
+
+```bash
+vllm bench serve \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --top-k 10 \
+  --top-p 0.9 \
+  --temperature 0.5 \
+  --num-prompts 10
+```
+
+##### Running With Ramp-Up Request Rate
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
+</details>
+
+#### 📈 Offline Throughput Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+```bash
+vllm bench throughput \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset-name sonnet \
+  --dataset-path vllm/benchmarks/sonnet.txt \
+  --num-prompts 10
+```
+
+If successful, you will see the following output
+
+```text
+Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
+Total num prompt tokens:  5014
+Total num output tokens:  1500
+```
+
+##### VisionArena Benchmark for Vision Language Models
+
+```bash
+vllm bench throughput \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat \
+  --num-prompts 1000 \
+  --hf-split train
+```
+
+The `num prompt tokens` now includes image token counts
+
+```text
+Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
+Total num prompt tokens:  14527
+Total num output tokens:  1280
+```
+
+##### InstructCoder Benchmark with Speculative Decoding
+
+``` bash
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+VLLM_USE_V1=1 \
+vllm bench throughput \
+    --dataset-name=hf \
+    --dataset-path=likaixin/InstructCoder \
+    --model=meta-llama/Meta-Llama-3-8B-Instruct \
+    --input-len=1000 \
+    --output-len=100 \
+    --num-prompts=2048 \
+    --async-engine \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+```text
+Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
+Total num prompt tokens:  261136
+Total num output tokens:  204800
+```
+
+##### Other HuggingFaceDataset Examples
+
+`lmms-lab/LLaVA-OneVision-Data`:
+
+```bash
+vllm bench throughput \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path lmms-lab/LLaVA-OneVision-Data \
+  --hf-split train \
+  --hf-subset "chart2text(cauldron)" \
+  --num-prompts 10
+```
+
+`Aeala/ShareGPT_Vicuna_unfiltered`:
+
+```bash
+vllm bench throughput \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+  --hf-split train \
+  --num-prompts 10
+```
+
+`AI-MO/aimo-validation-aime`:
+
+```bash
+vllm bench throughput \
+  --model Qwen/QwQ-32B \
+  --backend vllm \
+  --dataset-name hf \
+  --dataset-path AI-MO/aimo-validation-aime \
+  --hf-split train \
+  --num-prompts 10
+```
+
+Benchmark with LoRA adapters:
+
+``` bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench throughput \
+  --model meta-llama/Llama-2-7b-hf \
+  --backend vllm \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --dataset-name sharegpt \
+  --num-prompts 10 \
+  --max-loras 2 \
+  --max-lora-rank 8 \
+  --enable-lora \
+  --lora-path yard1/llama-2-7b-sql-lora-test
+```
+
+</details>
+
+#### 🛠️ Structured Output Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Benchmark the performance of structured output generation (JSON, grammar, regex).
+
+##### Server Setup
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
+```
+
+##### JSON Schema Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset json \
+  --structured-output-ratio 1.0 \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+##### Grammar-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset grammar \
+  --structure-type grammar \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+##### Regex-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset regex \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+##### Choice-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset choice \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+##### XGrammar Benchmark Dataset
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset xgrammar_bench \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+</details>
+
+#### 📚 Long Document QA Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Benchmark the performance of long document question-answering with prefix caching.
+
+##### Basic Long Document QA Test
+
+```bash
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 16 \
+  --document-length 2000 \
+  --output-len 50 \
+  --repeat-count 5
+```
+
+##### Different Repeat Modes
+
+```bash
+# Random mode (default) - shuffle prompts randomly
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode random
+
+# Tile mode - repeat entire prompt list in sequence
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode tile
+
+# Interleave mode - repeat each prompt consecutively
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode interleave
+```
+
+</details>
+
+#### 🗂️ Prefix Caching Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Benchmark the efficiency of automatic prefix caching.
+
+##### Fixed Prompt with Prefix Caching
+
+```bash
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-prompts 1 \
+  --repeat-count 100 \
+  --input-length-range 128:256
+```
+
+##### ShareGPT Dataset with Prefix Caching
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --enable-prefix-caching \
+  --num-prompts 20 \
+  --repeat-count 5 \
+  --input-length-range 128:256
+```
+
+##### Prefix Repetition Dataset
+
+```bash
+vllm bench serve \
+  --backend openai \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --dataset-name prefix_repetition \
+  --num-prompts 100 \
+  --prefix-repetition-prefix-len 512 \
+  --prefix-repetition-suffix-len 128 \
+  --prefix-repetition-num-prefixes 5 \
+  --prefix-repetition-output-len 128
+```
+
+</details>
+
+#### ⚡ Request Prioritization Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Benchmark the performance of request prioritization in vLLM.
+
+##### Basic Prioritization Test
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority
+```
+
+##### Multiple Sequences per Prompt
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority \
+  --n 2
+```
+
+</details>
+
+#### 👁️ Multi-Modal Benchmark
+
+<details class="admonition abstract" markdown="1">
+<summary>Show more</summary>
+
+Benchmark the performance of multi-modal requests in vLLM.
+
+##### Images (ShareGPT4V)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"image": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4v/images
+```
+
+Send requests with images:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completions
+```
+
+##### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"video": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completions
+```
+
+##### Synthetic Random Images (random-mm)
+
+Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
+
+Notes:
+
+- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
+- Video sampling is not yet implemented.
+
+Start the server (example):
+
+```bash
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+  --dtype bfloat16 \
+  --max-model-len 16384 \
+  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
+  --mm-processor-kwargs max_pixels=1003520
+```
+
+Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `--random-output-len`.
+
+Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
+
+```bash
+vllm bench serve \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-3B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name random-mm \
+  --num-prompts 100 \
+  --max-concurrency 10 \
+  --random-prefix-len 25 \
+  --random-input-len 300 \
+  --random-output-len 40 \
+  --random-range-ratio 0.2 \
+  --random-mm-base-items-per-request 2 \
+  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
+  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
+  --request-rate inf \
+  --ignore-eos \
+  --seed 42
+```
+
+The number of items per request can be controlled by passing multiple image buckets:
+
+```bash
+  --random-mm-base-items-per-request 2 \
+  --random-mm-num-mm-items-range-ratio 0.5 \
+  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
+  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
+```
+
+Flags specific to `random-mm`:
+
+- `--random-mm-base-items-per-request`: base number of multimodal items per request.
+- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
+- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
+- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
+
+Behavioral notes:
+
+- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
+
+How sampling works:
+
+- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
+- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
+- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
+This should be seen as an edge case, and this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
+- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
+
+</details>

 [](){ #performance-benchmarks }

@ -13,22 +794,22 @@ The performance benchmarks are used for development to confirm whether new chang

 ### Manually Trigger the benchmark

-Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.  
+Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
 For CPU environment, please use the image with "-cpu" postfix.

-Here is an example for docker run command for CPU.  
+Here is an example for docker run command for CPU.

 ```bash
 docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface  -e HF_TOKEN=''  --shm-size=16g --name vllm-cpu-ci  public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
 ```

-Then, run below command inside the docker instance.  
+Then, run below command inside the docker instance.

 ```bash
 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 ```

-When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.  
+When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.

 #### Runtime environment variables

--- a/docs/contributing/incremental_build.md
+++ b/docs/contributing/incremental_build.md
@ -40,6 +40,16 @@ python tools/generate_cmake_presets.py

 The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.

+**Force overwrite existing file:**
+
+To automatically overwrite an existing `CMakeUserPresets.json` without prompting, use the `--force-overwrite` flag:
+
+```console
+python tools/generate_cmake_presets.py --force-overwrite
+```
+
+This is particularly useful in automated scripts or CI/CD environments where interactive prompts are not desired.
+
 After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository.

 ### Example `CMakeUserPresets.json`
--- a/docs/contributing/model/README.md
+++ b/docs/contributing/model/README.md
@ -3,7 +3,7 @@
 !!! important
    Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!

-vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/compatibility_matrix.md) to optimize their performance.
+vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.

 The complexity of integrating a model into vLLM depends heavily on the model's architecture.
 The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
@ -15,6 +15,7 @@ Read through these pages for a step-by-step guide:
 - [Registering a Model](registration.md)
 - [Unit Testing](tests.md)
 - [Multi-Modal Support](multimodal.md)
+- [Speech-to-Text Support](transcription.md)

 !!! tip
    If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@ -840,7 +840,6 @@ Some HF processors directly insert feature tokens without replacing anything in
 Examples:

 - BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
 - Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>

 ### Handling prompt updates unrelated to multi-modal data
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@ -0,0 +1,276 @@
+# Speech-to-Text (Transcription/Translation) Support
+
+This document walks you through the steps to add support for speech-to-text (ASR) models to vLLM’s transcription and translation APIs by implementing [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription].
+Please refer to the [supported models](../../models/supported_models.md#transcription) for further guidance.
+
+## Update the base vLLM model
+
+It is assumed you have already implemented your model in vLLM according to the basic model guide. Extend your model with the [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription] interface and implement the following class attributes and methods.
+
+### `supported_languages` and `supports_transcription_only`
+
+Declare supported languages and capabilities:
+
+- The `supported_languages` mapping is validated at init time.
+- Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper).
+
+??? code "supported_languages and supports_transcription_only"
+    ```python
+    from typing import ClassVar, Mapping, Optional, Literal, cast
+    import numpy as np
+    import torch
+    from torch import nn
+
+    from vllm.config import ModelConfig, SpeechToTextConfig
+    from vllm.inputs.data import PromptType
+    from vllm.model_executor.models.interfaces import SupportsTranscription
+    
+    class YourASRModel(nn.Module, SupportsTranscription):
+        # Map of ISO 639-1 language codes to language names
+        supported_languages: ClassVar[Mapping[str, str]] = {
+            "en": "English",
+            "it": "Italian",
+            # ... add more as needed
+        }
+        
+        # If your model only supports audio-conditioned generation
+        # (no text-only generation), enable this flag.
+        supports_transcription_only: ClassVar[bool] = True
+    ```
+
+Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config].
+
+This is for controlling general behavior of the API when serving your model:
+
+??? code "get_speech_to_text_config()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_speech_to_text_config(
+            cls,
+            model_config: ModelConfig,
+            task_type: Literal["transcribe", "translate"],
+        ) -> SpeechToTextConfig:
+            return SpeechToTextConfig(
+                sample_rate=16_000,
+                max_audio_clip_s=30,
+                # Set to None to disable server-side chunking if your
+                # model/processor handles it already
+                min_energy_split_window_size=None,
+            )
+    ```
+
+See [Audio preprocessing and chunking](#audio-preprocessing-and-chunking) for what each field controls.
+
+Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns:
+
+#### Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n)
+
+Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
+
+??? code "get_generation_prompt()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_generation_prompt(
+            cls,
+            audio: np.ndarray,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+            language: Optional[str],
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str],
+        ) -> PromptType:
+            # Example with a free-form instruction prompt
+            task_word = "Transcribe" if task_type == "transcribe" else "Translate"
+            prompt = (
+                "<start_of_turn>user\n"
+                f"{task_word} this audio: <audio_soft_token>"
+                "<end_of_turn>\n<start_of_turn>model\n"
+            )
+
+            return {
+                "multi_modal_data": {"audio": (audio, stt_config.sample_rate)},
+                "prompt": prompt,
+            }
+    ```
+
+    For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md).
+
+#### Encoder–decoder audio-only (e.g., Whisper)
+
+Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
+
+??? code "get_generation_prompt()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_generation_prompt(
+            cls,
+            audio: np.ndarray,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+            language: Optional[str],
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str],
+        ) -> PromptType:
+            if language is None:
+                raise ValueError("Language must be specified")
+
+            prompt = {
+                "encoder_prompt": {
+                    "prompt": "",
+                    "multi_modal_data": {
+                        "audio": (audio, stt_config.sample_rate),
+                    },
+                },
+                "decoder_prompt": (
+                    (f"<|prev|>{request_prompt}" if request_prompt else "")
+                    + f"<|startoftranscript|><|{language}|>"
+                    + f"<|{task_type}|><|notimestamps|>"
+                ),
+            }
+            return cast(PromptType, prompt)
+    ```
+
+### `validate_language` (optional)
+
+Language validation is performed via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language].
+
+If your model requires a language and you want a default, override this method (see Whisper):
+
+??? code "validate_language()"
+    ```python
+    @classmethod
+    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+        if language is None:
+            logger.warning(
+                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
+            language = "en"
+        return super().validate_language(language)
+    ```
+
+### `get_num_audio_tokens` (optional)
+
+Token accounting for streaming is done via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens].
+
+Provide a fast duration→token estimate to improve streaming usage statistics:
+
+??? code "get_num_audio_tokens()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_num_audio_tokens(
+            cls,
+            audio_duration_s: float,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+        ) -> Optional[int]:
+            # Return None if unknown; otherwise return an estimate.
+            return int(audio_duration_s * stt_config.sample_rate // 320)  # example
+    ```
+
+## Audio preprocessing and chunking
+
+The API server takes care of basic audio I/O and optional chunking before building prompts:
+
+- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`.
+- Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`.
+- Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words.
+
+Relevant server logic:
+
+??? code "_preprocess_speech_to_text()"
+    ```python
+    # vllm/entrypoints/openai/speech_to_text.py
+    async def _preprocess_speech_to_text(...):
+        language = self.model_cls.validate_language(request.language)
+        ...
+        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+        duration = librosa.get_duration(y=y, sr=sr)
+        do_split_audio = (self.asr_config.allow_audio_chunking
+                        and duration > self.asr_config.max_audio_clip_s)
+        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
+        prompts = []
+        for chunk in chunks:
+            prompt = self.model_cls.get_generation_prompt(
+                audio=chunk,
+                stt_config=self.asr_config,
+                model_config=self.model_config,
+                language=language,
+                task_type=self.task_type,
+                request_prompt=request.prompt,
+                to_language=to_language,
+            )
+            prompts.append(prompt)
+        return prompts, duration
+    ```
+
+## Exposing tasks automatically
+
+vLLM automatically advertises transcription support if your model implements the interface:
+
+```python
+if supports_transcription(model):
+    if model.supports_transcription_only:
+        return ["transcription"]
+    supported_tasks.append("transcription")
+```
+
+When enabled, the server initializes the transcription and translation handlers:
+
+```python
+state.openai_serving_transcription = OpenAIServingTranscription(...) if "transcription" in supported_tasks else None
+state.openai_serving_translation = OpenAIServingTranslation(...) if "transcription" in supported_tasks else None
+```
+
+No extra registration is required beyond having your model class available via the model registry and implementing `SupportsTranscription`.
+
+## Examples in-tree
+
+- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
+- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
+- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
+
+## Test with the API
+
+Once your model implements `SupportsTranscription`, you can test the endpoints (API mimics OpenAI):
+
+- Transcription (ASR):
+
+    ```bash
+    curl -s -X POST \
+      -H "Authorization: Bearer $VLLM_API_KEY" \
+      -H "Content-Type: multipart/form-data" \
+      -F "file=@/path/to/audio.wav" \
+      -F "model=$MODEL_ID" \
+      http://localhost:8000/v1/audio/transcriptions
+    ```
+
+- Translation (source → English unless otherwise supported):
+
+    ```bash
+    curl -s -X POST \
+      -H "Authorization: Bearer $VLLM_API_KEY" \
+      -H "Content-Type: multipart/form-data" \
+      -F "file=@/path/to/audio.wav" \
+      -F "model=$MODEL_ID" \
+      http://localhost:8000/v1/audio/translations
+    ```
+
+Or check out more examples in <gh-file:examples/online_serving>.
+
+!!! note
+    - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
+    - Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass.
+    - For multilingual behavior, keep `supported_languages` aligned with actual model capabilities.
--- a/docs/deployment/frameworks/anything-llm.md
+++ b/docs/deployment/frameworks/anything-llm.md
@ -1,41 +1,53 @@
-# Anything LLM
+# AnythingLLM

-[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
+[AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.

 It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.

 ## Prerequisites

- Setup vLLM environment
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with a supported chat-completion model, for example:

-```bash
-vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
+    ```

- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).
+1. Download and install [AnythingLLM Desktop](https://anythingllm.com/desktop).

- On the bottom left of open settings, AI Providers --> LLM:
-    - LLM Provider: Generic OpenAI
-    - Base URL: http://{vllm server host}:{vllm server port}/v1
-    - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
+1. Configure the AI provider:

-![](../../assets/deployment/anything-llm-provider.png)
+    - At the bottom, click the 🔧 wrench icon -> **Open settings** -> **AI Providers** -> **LLM**.
+    - Enter the following values:
+        - LLM Provider: Generic OpenAI
+        - Base URL: `http://{vllm server host}:{vllm server port}/v1`
+        - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`

- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
+    ![set AI providers](../../assets/deployment/anything-llm-provider.png)

-![](../../assets/deployment/anything-llm-chat-without-doc.png)
+1. Create a workspace:

- Click the upload button:
-    - upload the doc
-    - select the doc and move to the workspace
-    - save and embed
+    1. At the bottom, click the ↺ back icon to return to workspaces.
+    1. Create a workspace (e.g., `vllm`) and start chatting.

-![](../../assets/deployment/anything-llm-upload-doc.png)
+    ![create a workspace](../../assets/deployment/anything-llm-chat-without-doc.png)

- Chat again:
+1. Add a document.

-![](../../assets/deployment/anything-llm-chat-with-doc.png)
+    1. Click the 📎 attachment icon.
+    1. Upload a document.
+    1. Select and move the document into your workspace.
+    1. Save and embed it.
+
+    ![add a document](../../assets/deployment/anything-llm-upload-doc.png)
+
+1. Chat using your document as context.
+
+    ![chat with your context](../../assets/deployment/anything-llm-chat-with-doc.png)
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@ -4,9 +4,7 @@

 ## Prerequisites

- Setup vLLM environment
-
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
+Set up the vLLM and [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment:

 ```bash
 pip install vllm
@ -18,14 +16,14 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model mistralai/Mistral-7B-Instruct-v0.2
-```
+    ```bash
+    python -m vllm.entrypoints.openai.api_server \
+        --model mistralai/Mistral-7B-Instruct-v0.2
+    ```

- Call it with AutoGen:
+1. Call it with AutoGen:

 ??? code

--- a/docs/deployment/frameworks/chatbox.md
+++ b/docs/deployment/frameworks/chatbox.md
@ -6,27 +6,31 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac

 ## Prerequisites

- Setup vLLM environment
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve qwen/Qwen1.5-0.5B-Chat
+    ```

- Download and install [Chatbox desktop](https://chatboxai.app/en#download).
+1. Download and install [Chatbox desktop](https://chatboxai.app/en#download).

- On the bottom left of settings, Add Custom Provider
+1. On the bottom left of settings, Add Custom Provider
    - API Mode: `OpenAI API Compatible`
    - Name: vllm
    - API Host: `http://{vllm server host}:{vllm server port}/v1`
    - API Path: `/chat/completions`
    - Model: `qwen/Qwen1.5-0.5B-Chat`

-![](../../assets/deployment/chatbox-settings.png)
+    ![](../../assets/deployment/chatbox-settings.png)

- Go to `Just chat`, and start to chat:
+1. Go to `Just chat`, and start to chat:

-![](../../assets/deployment/chatbox-chat.png)
+    ![](../../assets/deployment/chatbox-chat.png)
--- a/docs/deployment/frameworks/dify.md
+++ b/docs/deployment/frameworks/dify.md
@ -8,44 +8,50 @@ This guide walks you through deploying Dify using a vLLM backend.

 ## Prerequisites

- Setup vLLM environment
- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/)
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```
+
+And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/).

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve Qwen/Qwen1.5-7B-Chat
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-7B-Chat
+    ```

- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
+1. Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):

-```bash
-git clone https://github.com/langgenius/dify.git
-cd dify
-cd docker
-cp .env.example .env
-docker compose up -d
-```
+    ```bash
+    git clone https://github.com/langgenius/dify.git
+    cd dify
+    cd docker
+    cp .env.example .env
+    docker compose up -d
+    ```

- Open the browser to access `http://localhost/install`, config the basic login information and login.
+1. Open the browser to access `http://localhost/install`, configure the basic login information, and log in.

- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
+1. In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
+
+1. Fill in the model provider details as follows:

- Fill in the model provider details as follows:
    - **Model Type**: `LLM`
    - **Model Name**: `Qwen/Qwen1.5-7B-Chat`
    - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1`
    - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
    - **Completion Mode**: `Completion`

-![](../../assets/deployment/dify-settings.png)
+    ![](../../assets/deployment/dify-settings.png)

- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
+1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:

-![](../../assets/deployment/dify-create-chatbot.png)
+    ![](../../assets/deployment/dify-create-chatbot.png)

- Click the chatbot you just created to open the chat interface and start interacting with the model:
+1. Click the chatbot you just created to open the chat interface and start interacting with the model:

-![](../../assets/deployment/dify-chat.png)
+    ![](../../assets/deployment/dify-chat.png)
--- a/docs/deployment/frameworks/haystack.md
+++ b/docs/deployment/frameworks/haystack.md
@ -6,7 +6,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac

 ## Prerequisites

- Setup vLLM and Haystack environment
+Set up the vLLM and Haystack environment:

 ```bash
 pip install vllm haystack-ai
@ -14,13 +14,13 @@ pip install vllm haystack-ai

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve mistralai/Mistral-7B-Instruct-v0.1
-```
+    ```bash
+    vllm serve mistralai/Mistral-7B-Instruct-v0.1
+    ```

- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
+1. Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.

 ??? code

--- a/docs/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@ -13,7 +13,7 @@ And LiteLLM supports all models on VLLM.

 ## Prerequisites

- Setup vLLM and litellm environment
+Set up the vLLM and litellm environment:

 ```bash
 pip install vllm litellm
@ -23,13 +23,13 @@ pip install vllm litellm

 ### Chat completion

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve qwen/Qwen1.5-0.5B-Chat
+    ```

- Call it with litellm:
+1. Call it with litellm:

 ??? code

@ -51,13 +51,13 @@ vllm serve qwen/Qwen1.5-0.5B-Chat

 ### Embeddings

- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.

-```bash
-vllm serve BAAI/bge-base-en-v1.5
-```
+    ```bash
+    vllm serve BAAI/bge-base-en-v1.5
+    ```

- Call it with litellm:
+1. Call it with litellm:

 ```python
 from litellm import embedding   
--- a/docs/deployment/frameworks/retrieval_augmented_generation.md
+++ b/docs/deployment/frameworks/retrieval_augmented_generation.md
@ -11,7 +11,7 @@ Here are the integrations:

 ### Prerequisites

- Setup vLLM and langchain environment
+Set up the vLLM and langchain environment:

 ```bash
 pip install -U vllm \
@ -22,33 +22,33 @@ pip install -U vllm \

 ### Deploy

- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.

-```bash
-# Start embedding service (port 8000)
-vllm serve ssmits/Qwen2-7B-Instruct-embed-base
-```
+    ```bash
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    ```

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-# Start chat service (port 8001)
-vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
-```
+    ```bash
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+    ```

- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
+1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>

- Run the script
+1. Run the script:

-```python
-python retrieval_augmented_generation_with_langchain.py
-```
+    ```bash
+    python retrieval_augmented_generation_with_langchain.py
+    ```

 ## vLLM + llamaindex

 ### Prerequisites

- Setup vLLM and llamaindex environment
+Set up the vLLM and llamaindex environment:

 ```bash
 pip install vllm \
@ -60,24 +60,24 @@ pip install vllm \

 ### Deploy

- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.

-```bash
-# Start embedding service (port 8000)
-vllm serve ssmits/Qwen2-7B-Instruct-embed-base
-```
+    ```bash
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    ```

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-# Start chat service (port 8001)
-vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
-```
+    ```bash
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+    ```

- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
+1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>

- Run the script
+1. Run the script:

-```python
-python retrieval_augmented_generation_with_llamaindex.py
-```
+    ```bash
+    python retrieval_augmented_generation_with_llamaindex.py
+    ```
--- a/docs/deployment/frameworks/streamlit.md
+++ b/docs/deployment/frameworks/streamlit.md
@ -6,35 +6,33 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu

 ## Prerequisites

- Setup vLLM environment
+Set up the vLLM environment by installing all required packages:
+
+```bash
+pip install vllm streamlit openai
+```

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with a supported chat completion model, e.g.

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-0.5B-Chat
+    ```

- Install streamlit and openai:
+1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>

-```bash
-pip install streamlit openai
-```
+1. Start the streamlit web UI and start to chat:

- Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
-
- Start the streamlit web UI and start to chat:
-
-```bash
-streamlit run streamlit_openai_chatbot_webserver.py
-
-# or specify the VLLM_API_BASE or VLLM_API_KEY
-VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
+    ```bash
    streamlit run streamlit_openai_chatbot_webserver.py

-# start with debug mode to view more details
-streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
-```
+    # or specify the VLLM_API_BASE or VLLM_API_KEY
+    VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
+        streamlit run streamlit_openai_chatbot_webserver.py

-![](../../assets/deployment/streamlit-chat.png)
+    # start with debug mode to view more details
+    streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
+    ```
+
+    ![Chat with vLLM assistant in Streamlit](../../assets/deployment/streamlit-chat.png)
--- a/docs/design/huggingface_integration.md
+++ b/docs/design/huggingface_integration.md
@ -1,31 +1,31 @@
 # Integration with Hugging Face

-This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
+This document describes how vLLM integrates with Hugging Face libraries. We will explain step by step what happens under the hood when we run `vllm serve`.

-Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`.
+Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`.

 1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process:
    - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
-    - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
-    - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
+    - If the `model` argument is a Hugging Face model ID consisting of a username and model name, vLLM will first try to use the config file from the Hugging Face local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the Hugging Face cache works.
+    - If the `model` argument is a Hugging Face model ID but it is not found in the cache, vLLM will download the config file from the Hugging Face model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.

 2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation.

 3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that:
-    - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
-    - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
+    - Hugging Face also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, Hugging Face will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
+    - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, Hugging Face will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.

 4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation.

 5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs.

-Beyond that, there are two more things vLLM depends on HuggingFace for.
+Beyond that, there are two more things vLLM depends on Hugging Face for.

-1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
+1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).

-2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
+2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
    - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:

-This completes the integration between vLLM and HuggingFace.
+This completes the integration between vLLM and Hugging Face.

-In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
+In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the Hugging Face model hub or a local directory. It uses the config class from either vLLM, Hugging Face transformers, or loads the config class from the model's repository.
--- a/docs/design/logits_processors.md
+++ b/docs/design/logits_processors.md
@ -0,0 +1,559 @@
+# Logits Processors
+
+!!! important
+    Some logits processors design changes are still in progress and the API may
+    change in the near future. We hope to stabilize this part of the API soon.
+
+This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors.
+
+## Logits Processors Background
+
+A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior.
+
+In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax.  
+
+## Logits Processors in the vLLM engine
+
+The vLLM engine's persistent batch data structure maintains a list of loaded logits processors.
+
+In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch (i.e. each request's logits-processor-specific configuration settings). Therefore, logits processors are stateful.
+
+In each engine step, the vLLM engine will (1) update each logits processor's internal state and (2) apply logits processors to the model output logits.
+
+### Updating Logits Processor Internal State
+
+At the beginning of each engine step, the persistent batch may add, discard and/or reorder requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step.
+
+The pseudocode below shows the process by which the vLLM persistent batch notifies each logits processor of changes in batch state:
+
+??? code "Model Runner Updates Logits Processor States"
+
+    ``` python
+    # gpu_model_runner.py
+
+    class GPUModelRunner(...):
+
+        ...
+
+        def execute_model(self, scheduler_output, ...):
+            self._update_states(scheduler_output)
+
+            ...
+
+        def _update_states(...):
+
+            ...
+
+            # ...update persistent batch to reflect new/finished requests & reordering
+            # of requests within batch...
+
+            ...
+
+            self.input_batch.refresh_metadata()
+
+
+    # gpu_input_batch.py
+
+    class InputBatch:
+
+        ...
+
+        def refresh_metadata(self):
+
+            ...
+
+            # Update each logits processor's state to reflect persistent batch state
+            batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
+            for logit_proc in self.logitsprocs.all:
+                logit_proc.update_state(batch_update)
+
+            ...
+
+
+    # vllm/v1/sample/logits_processor/interface.py
+
+    @dataclass(frozen=True)
+    class BatchUpdate:
+        # Batch state-change data structure which is passed to logits processors'
+        # update_state() methods
+
+        batch_size: int
+
+        removed: Sequence[RemovedRequest]
+        added: Sequence[AddedRequest]
+        moved: Sequence[MovedRequest]
+    
+    ```
+
+### Applying Logits Processors to the Model Output Logits
+
+After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logits, yielding transformed logits (the `apply()` methods may modify the logits in-place or out-of-place, although in-place is more memory-efficient). This process is shown in the pseudocode below.
+
+Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata` (not shown in the code below), the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`.
+
+??? code "Apply logits processors to model output logits"
+
+    ``` python
+    # gpu_model_runner.py
+
+    class GPUModelRunner(...):
+
+        ...
+
+        def execute_model(self, scheduler_output, ...):
+            # (discussed in previous section)
+            self._update_states(scheduler_output)
+
+            ...
+
+            # ...run model inference to obtain logits...
+
+            ...
+
+            # Invoke sampler, which applies logits processors
+            sampler_output = self.sampler(logits=logits,
+                                          sampling_metadata=sampling_metadata)
+
+            ...
+
+
+    # sampler.py
+
+    class Sampler(nn.Module):
+
+        ...
+
+        def forward(self, logits, sampling_metadata):
+
+            ...
+
+            # Apply non-argmax-invariant logits processors to model output logits
+            for processor in (sampling_metadata.logitsprocs.non_argmax_invariant):
+                logits = processor.apply(logits)
+
+            sampled = self.sample(logits, sampling_metadata)
+
+            ...
+
+            # ...return sampler output data structure...
+
+
+        def sample(self, logits, sampling_metadata):
+
+            ...
+
+            # ...exit early if all requests are greedy-sampling...
+
+            ...
+
+            # Apply argmax-invariant logits processors
+            for processor in sampling_metadata.logitsprocs.argmax_invariant:
+                logits = processor.apply(logits)
+
+            ...
+
+            # ...perform sampling and return sampling result...
+    ``` 
+
+At sampling time, the sampler checks whether all requests in the persistent batch employ greedy sampling. If that is the case, the sampler saves compute by skipping "argmax-invariant" logits processors. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request).
+
+* An **argmax-invariant logits processor** is a logits processor (such as Min-P) which does not modify the argmax. For example, a logits processor which masks out the lowest-probability tokens will not change which token ID has the max logit. Greedy sampling always picks the highest-logit-value token ID, and so conceptually an argmax-invariant logits processor can be skipped for greedy sampling requests.
+
+* A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests.
+
+The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling.
+
+## Logits Processor Programming Model
+
+The previous sections alluded to the interfaces which vLLM logits processors must support. This section introduces in full the programming model for implementing logits processors that are compatible with the vLLM engine, including the `LogitsProcessor` base class and its interface methods as well as the `BatchUpdate` data structure for representing persistent batch state changes, both of which are shown in the code below:
+
+??? code "`LogitsProcessor` base class and `BatchUpdate` data structure"
+
+    ``` python
+    from abc import ABC, abstractmethod
+    from collections.abc import Sequence
+    from dataclasses import dataclass
+    from enum import Enum, auto
+    from typing import TYPE_CHECKING, Optional
+
+    import torch
+
+    from vllm import SamplingParams
+
+    if TYPE_CHECKING:
+        from vllm.config import VllmConfig
+
+
+    class MoveDirectionality(Enum):
+        # One-way i1->i2 req move within batch
+        UNIDIRECTIONAL = auto()
+        # Two-way i1<->i2 req swap within batch
+        SWAP = auto()
+
+
+    # (index, params, prompt_tok_ids, output_tok_ids) tuples for new
+    # requests added to the batch.
+    AddedRequest = tuple[int, SamplingParams, list[int], list[int]]
+
+    # (index 1, index 2, directionality) tuples representing
+    # one-way moves or two-way swaps of requests in batch
+    MovedRequest = tuple[int, int, MoveDirectionality]
+
+    # Batch indices of any removed requests.
+    RemovedRequest = int
+
+
+    @dataclass(frozen=True)
+    class BatchUpdate:
+        """Persistent batch state change info for logitsprocs"""
+        batch_size: int  # Current num reqs in batch
+
+        # Metadata for requests added to, removed from, and moved
+        # within the persistent batch.
+        #
+        # Key assumption: the `output_tok_ids` list (which is an element of each
+        # tuple in `added`) is a reference to the request's running output tokens
+        # list; via this reference, the logits processors always see the latest
+        # list of generated output tokens
+        removed: Sequence[RemovedRequest]
+        moved: Sequence[MovedRequest]
+        added: Sequence[AddedRequest]
+
+
+    class LogitsProcessor(ABC):
+
+        @abstractmethod
+        def __init__(self, vllm_config: "VllmConfig", device: torch.device,
+                    is_pin_memory: bool) -> None:
+            raise NotImplementedError
+
+        @abstractmethod
+        def apply(self, logits: torch.Tensor) -> torch.Tensor:
+            raise NotImplementedError
+
+        @abstractmethod
+        def is_argmax_invariant(self) -> bool:
+            """True if logits processor has no impact on the
+            argmax computation in greedy sampling.
+            NOTE: may or may not have the same value for all
+            instances of a given LogitsProcessor subclass,
+            depending on subclass implementation.
+            """
+            raise NotImplementedError
+
+        @abstractmethod
+        def update_state(
+            self,
+            batch_update: Optional["BatchUpdate"],
+        ) -> None:
+            """Called when there are new output tokens, prior
+            to each forward pass.
+
+            Args:
+                batch_update is non-None iff there have been
+                changes to the batch makeup.
+            """
+            raise NotImplementedError
+            
+    ```
+
+A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods:
+
+* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
+    * `vllm_config`: engine configuration data structure
+    * `device`: hardware accelerator device info
+    * `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
+
+* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
+    * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
+    * Apply logits processor transformation at batch granularity
+    * Return a transformed `(num_requests) x (vocab_size)` logits tensor
+    * You can modify the input logits in-place or out-of-place; in-place is more memory-efficient
+
+* `is_argmax_invariant(self) -> bool`:
+    * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
+    * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
+
+* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
+    * Use the `BatchUpdate` members to update logits processor internal state
+    * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_tok_ids` lists that it could have retained when they were added.
+
+### `BatchUpdate` data structure
+
+The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`):
+
+* **Remove:** remove (without replacement) request at index `i`
+
+    * A Remove is represented in `BatchUpdate.removed` by an `int` (representing `i`)
+
+    * Effect of remove-at-index on batch:
+
+        ``` text
+        Batch: [A,B,C]
+        Remove @ i:  1
+
+        =>
+
+        New Batch: [A,x,C] # Discard B and leave an empty slot
+        ```
+
+* **Add:** add (or replace existing request with) a new request at index `i`. If a request is replaced, its associated state should be discarded.
+
+    * An Add is represented in `BatchUpdate.added` as a tuple of
+
+        ``` text
+        (index, new request SamplingParams, prompt token ids, output token ids)
+        ```
+
+    * `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference. **This is important for LogitsProcessors that take into account the tokens generated so far**.
+
+    * The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields
+
+    * If index `i` currently holds a request, a replacement occurs:
+
+        ``` text
+        Batch: [A,B,C]
+        New request to be added @ i: D @ 1
+
+        =>
+
+        New Batch: [A,D,C] # Add D, discard B
+        ```
+
+    * If index `i` does not currently hold a request (because `i` is out of bounds of the current batch size):
+
+        ``` text
+        Batch: [A,B,C]
+        New request to be added @ i: D @ 3
+
+        =>
+
+        New Batch: [A,B,C,D] # Add D, extending batch
+        ```
+
+* **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d`
+
+    * A Move is represented in `BatchUpdate.moved` as a tuple of
+
+        ``` text
+        (s, d, UNIDIRECTIONAL or SWAP)
+        ```
+
+    * If the Move specifies `UNIDIRECTIONAL`:
+
+        * The request at index `s` is moved to index `d`; index `s` becomes an empty slot
+
+            ``` text
+            Batch: [A,x,C,D]
+            Unidirectionally Move s -> d:  3 -> 1
+
+            =>
+
+            New Batch: [A,D,C,x] # Move D to 1, leaving empty slot at 3
+            ```
+
+        * If another request already resided at index `d`, it is replaced and discarded
+
+            ``` text
+            Batch: [A,B,C,D]
+            Unidirectionally Move s -> d:  3 -> 1
+
+            =>
+
+            New Batch: [A,D,C,x] # Move D to 1, discarding B and leaving empty slot at 3
+            ```
+
+    * If the Move specifies `SWAP`, the requests at `s` and `d` exchange indices
+
+        ``` text
+        Batch: [A,B,C,D]
+        Swap Move s <-> d:  3 <-> 1
+
+        =>
+
+        New Batch: [A,D,C,B] # Swap B and D
+        ```
+
+Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the beginning of the engine step.
+
+### How the vLLM engine builds the `BatchUpdate` data structure
+
+Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
+
+1. Identify indices of requests which finished in the current engine step
+
+2. Identify new requests introduced in the current step
+
+3. Use Add operations to replace as many finished requests as possible with new requests, in order of increasing index of the replaced request, starting with the lowest index
+
+4. Based on the relative number of new and finished requests:
+
+    1. If the numbers of new and finished requests are the same, proceed to next step
+
+    2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
+
+    3. *If there are fewer new requests than finished requests:*
+
+        * Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
+
+        * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
+
+        * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
+
+5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
+
+Notes:
+
+* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
+
+* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
+    * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3
+    * In other words Move operations can be assumed to be applied after Adds and Removes
+
+* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
+
+* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
+
+#### Example: Batch Update with Fewer New Requests Than Finished Requests
+
+The following example models an engine step where 1 new request is introduced and 2 finished requests are eliminated; additionally, the attention backend performs a swap to optimize the batch ordering.
+
+``` text
+Batch state (beginning of engine step): [A,B,C,D]
+Batch size: 4
+
+New requests: E
+
+Finished requests: A, C
+
+Processing steps (using BatchUpdate abstraction):
+
+1. Add E at index 0
+
+[E,B,C,D] # Discard A
+Batch size: 4
+
+2. Remove at index 2
+
+[E,B,x,D] # Discard C, empty slot at index 2
+Batch size: 4
+
+3. Condense batch with a Unidirectional Move 3 -> 2 operation and shrink batch
+
+[E,B,D] x # Empty slot is now outside batch
+Batch size: 3
+
+4. Attention backend optimization: reorder batch with Swap 0 <-> 1
+
+[B,E,D]
+Batch size: 3
+
+```
+
+The resulting `BatchUpdate` data structure will look like
+
+``` text
+BatchUpdate instance
+* added: [(0,E's SamplingParams,E's prompt tokens ref,E's output tokens ref)]
+* removed: [2] # request C was removed without replacement
+* moved: [(3,2,UNIDIRECTIONAL),(0,1,SWAP)]
+```
+
+#### Example: Batch Update with More New Requests Than Finished Requests
+
+The following example models an engine step where 2 new requests are introduced and 1 finished request is eliminated. Additionally, the attention backend performs a swap to optimize the batch ordering.
+
+``` text
+Batch state (beginning of engine step): [A,B,C,D]
+Batch size: 4
+
+New requests: E,F
+
+Finished requests: C
+
+Processing steps (using BatchUpdate abstraction):
+
+1. Add E at index 2
+
+[A,B,E,D] # Discard C
+Batch size: 4
+
+2. Add F at index 4 (current max batch index + 1)
+
+[A,B,E,D,F] # Extend batch by 1
+Batch size: 5
+
+3. Attention backend optimization: reorder batch with Swap 0 <-> 1
+
+[B,A,E,D,F]
+Batch size: 5
+
+```
+
+Note that batch condensation is skipped because there are no empty slots left behind by Remove operations.
+
+The resulting `BatchUpdate` data structure will look like
+
+``` text
+BatchUpdate instance
+* added: [(2,E's SamplingParams,E's prompt tokens ref,E's output tokens ref),(4,F's SamplingParams,F's prompt tokens ref,F's output tokens ref)]
+* removed: [] # no requests were removed without replacement
+* moved: [(0,1,SWAP)]
+```
+
+## How to Introduce a New Logits Processor to vLLM
+
+### Best Practices for Writing Built-In Logits Processors
+
+* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
+    * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
+    * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor
+
+* It is up to the logits processor author to determine:
+
+    1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API
+
+    2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the built-in logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, e.g. by defaulting an argument to `None` or by passing in a specific do-nothing argument value such as `0.0`. Try to save compute and memory for requests which disable the logits processor
+
+    3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the built-in logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the built-in logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
+
+        * Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None`
+
+* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
+
+* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (e.g. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
+
+### Built-In Logits Processors
+
+Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently employs the following built-in logits processors based on the programming model described above:
+
+* Min-P
+
+* Logit bias
+
+* Min-tokens
+
+Review these logits processor implementations for guidance on writing built-in logits processors.
+
+Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model.
+
+* Allowed token IDs
+
+* Bad words
+
+* Repetition penalty
+
+* Frequency penalty
+
+* Presence penalty
+
+* Temperature
+
+* Top-K
+
+* Top-P
+
+### Custom Logits Processors
+
+vLLM can be augmented with [user-provided custom logits processors](../features/custom_logitsprocs.md).
--- a/docs/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
 ## Introduction

 !!! important
-    The source code references are to the state of the code at the time of writing in December, 2024.
+    The source code references are to the state of the code at the time of writing in December 2024.

 The use of Python multiprocessing in vLLM is complicated by:

--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@ -2,6 +2,6 @@

 vLLM's examples are split into three categories:

- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference)
- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving)
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others)
+- If you are using vLLM from within Python code, see the *Offline Inference* section.
+- If you are using vLLM from an HTTP application or client, see the *Online Serving* section.
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section.
--- a/docs/features/README.md
+++ b/docs/features/README.md
@ -76,6 +76,3 @@ th:not(:first-child) {
 | multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8477) | ✅     | ❌ |
 | best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
 | beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-
-!!! note
-    Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
--- a/docs/features/custom_arguments.md
+++ b/docs/features/custom_arguments.md
@ -0,0 +1,46 @@
+# Custom Arguments
+
+You can use vLLM *custom arguments* to pass in arguments which are not part of the vLLM `SamplingParams` and REST API specifications. Adding or removing a vLLM custom argument does not require recompiling vLLM, since the custom arguments are passed in as a dictionary.
+
+Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
+
+## Offline Custom Arguments
+
+Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`:
+
+``` python
+SamplingParams(extra_args={"your_custom_arg_name": 67})
+```
+
+This allows arguments which are not already part of `SamplingParams` to be passed into `LLM` as part of a request.
+
+## Online Custom Arguments
+
+The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`. The example below integrates custom arguments into a vLLM REST API request:
+
+``` bash
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-1.5B-Instruct",
+        ...
+        "vllm_xargs": {"your_custom_arg": 67}
+    }'
+```
+
+Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argument:
+
+``` python
+batch = await client.completions.create(
+    model="Qwen/Qwen2.5-1.5B-Instruct",
+    ...,
+    extra_body={
+        "vllm_xargs": {
+            "your_custom_arg": 67
+        }
+    }
+)
+```
+
+!!! note
+    `vllm_xargs` is assigned to `SamplingParams.extra_args` under the hood, so code which uses `SamplingParams.extra_args` is compatible with both offline and online scenarios.
--- a/docs/features/custom_logitsprocs.md
+++ b/docs/features/custom_logitsprocs.md
@ -0,0 +1,445 @@
+# Custom Logits Processors
+
+!!! important
+    Some logits processors design changes are still in progress and the API may
+    change in the near future. We hope to stabilize this part of the API soon
+
+A "custom" logits processor is written by a user of vLLM and is loaded into vLLM at initialization without needing to modify or recompile the vLLM source code. It is the opposite of a built-in logits processor.
+
+This document shows how to write, load and use a custom logits processor.
+
+## Logits Processors Background
+
+A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior.
+
+In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax.  
+
+## Creating a Custom Logits Processor
+
+Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods:
+
+* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
+    * `vllm_config`: engine configuration data structure
+    * `device`: hardware accelerator device info
+    * `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
+
+* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
+    * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
+    * Apply logits processor transformation at batch granularity
+    * Return a transformed `(num_requests) x (vocab_size)` logits tensor
+    * You can modify the input logits tensor in-place or out-of-place; in-place is more memory-efficient
+
+* `is_argmax_invariant(self) -> bool`:
+    * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
+    * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
+
+* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
+    * Use the `BatchUpdate` members to update logits processor internal state
+    * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
+
+### How the vLLM engine builds the `BatchUpdate` data structure
+
+!!! important
+    Some logits processors design changes are still in progress. We expect
+    that in the future you will not need to account for batch state changes
+    when implementing a logits processor, and the information in this section
+    will become irrelevant.
+
+Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
+
+1. Identify indices of requests which finished in the current engine step
+
+2. Identify new requests introduced in the current step
+
+3. Use Add operations to replace as many finished requests as possible with new requests, in order of increasing index of the replaced request starting with the lowest index
+
+4. Based on the relative number of new and finished requests:
+
+    1. If the numbers of new and finished requests are the same, proceed to next step
+
+    2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
+
+    3. *If there are fewer new requests than finished requests:*
+
+        * Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
+
+        * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
+
+        * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
+
+5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
+
+Notes:
+
+* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
+
+* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
+    * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3
+    * In other words Move operations can be assumed to be applied after Adds and Removes
+
+* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
+
+* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
+
+### Passing Custom Argument to a Custom Logits Processor
+
+Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.)
+
+### Example Custom Logits Processor Implementation
+
+The contrived example below implements a custom logits processor which consumes a `(num_requests) x (vocab_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float("-inf")`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request:
+
+??? code "Example custom logits processor definition"
+
+    ``` python
+    from typing import Optional
+    import torch
+    from vllm.config import VllmConfig
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.sample.logits_processor import (BatchUpdate,
+                                                LogitsProcessor,
+                                                MoveDirectionality)
+
+    class DummyLogitsProcessor(LogitsProcessor):
+        """Fake logit processor to support unit testing and examples"""
+
+        def __init__(self, vllm_config: "VllmConfig", device: torch.device,
+                    is_pin_memory: bool):
+            self.req_info: dict[int, int] = {}
+
+        def is_argmax_invariant(self) -> bool:
+            """Masking can change the argmax token, so greedy sampling is impacted"""
+            return False
+
+        def update_state(self, batch_update: Optional[BatchUpdate]):
+            if not batch_update:
+                return
+
+            # Process added requests.
+            for index, params, _, _ in batch_update.added:
+                assert params is not None
+                if params.extra_args and (target_token :=
+                                        params.extra_args.get("target_token")):
+                    self.req_info[index] = target_token
+                else: 
+                    self.req_info.pop(index, None)
+
+            if self.req_info:
+                # Process removed requests.
+                for index in batch_update.removed:
+                    self.req_info.pop(index, None)
+
+                # Process moved requests, unidirectional move (a->b) and swap
+                # (a<->b)
+                for adx, bdx, direct in batch_update.moved:
+                    a_val = self.req_info.pop(adx, None)
+                    b_val = self.req_info.pop(bdx, None)
+                    if a_val is not None:
+                        self.req_info[bdx] = a_val
+                    if direct == MoveDirectionality.SWAP and b_val is not None:
+                        self.req_info[adx] = b_val
+
+        def apply(self, logits: torch.Tensor) -> torch.Tensor:
+            if not self.req_info:
+                return logits
+
+            # Save target values before modification
+            cols = torch.tensor(
+                list(self.req_info.values()), dtype=torch.long, device=logits.device
+            )
+            rows = torch.tensor(
+                list(self.req_info.keys()), dtype=torch.long, device=logits.device
+            )
+            values_to_keep = logits[rows, cols].clone()
+
+            # Mask all but target tokens
+            logits[rows] = float('-inf')
+            logits[rows, cols] = values_to_keep
+
+            return logits
+    ```
+
+In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor.
+
+The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch.
+
+### Wrapping an Existing Request-Level Logits Processor
+
+Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation:
+
+``` python
+RequestLogitsProcessor = Union[
+
+    # (output token ids, logits tensor) -> logits tensor
+    Callable[[list[int], Tensor], Tensor],
+
+    # (prompt token ids, output token ids, logits tensor) -> logits tensor
+    Callable[[list[int], list[int], Tensor], Tensor],
+]
+```
+
+While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above.
+
+You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
+
+??? code "Example of Wrapping a Request-Level Logits Processor"
+
+    ``` python
+    ...
+
+    from vllm.v1.sample.logits_processor import (
+        AdapterLogitsProcessor, # Wrapper base-class
+        RequestLogitsProcessor, # Request-level logitsproc type annotation
+    )
+
+    ...
+
+    # Stand-in for your request-level logits processor:
+    class DummyPerReqLogitsProcessor:
+        """The request-level logits processor masks out all logits except the
+        token id identified by `target_token`"""
+
+        def __init__(self, target_token: int) -> None:
+            """Specify `target_token`"""
+            self.target_token = target_token
+
+        def __call__(
+            self,
+            output_ids: list[int],
+            logits: torch.Tensor,
+        ) -> torch.Tensor:
+            val_to_keep = logits[self.target_token].item()
+            logits[:] = float("-inf")
+            logits[self.target_token] = val_to_keep
+            return logits
+
+    ...
+
+    # Example of wrapping the request-level logits processor:
+    class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+        """Example of wrapping a fake request-level logit processor to create a
+        batch-level logits processor"""
+
+        def is_argmax_invariant(self) -> bool:
+            return False
+
+        def new_req_logits_processor(
+            self,
+            params: SamplingParams,
+        ) -> Optional[RequestLogitsProcessor]:
+            """This method returns a new request-level logits processor, customized
+            to the `target_token` value associated with a particular request.
+
+            Returns None if the logits processor should not be applied to the
+            particular request. To use the logits processor the request must have
+            a "target_token" custom argument with an integer value.
+
+            Args:
+            params: per-request sampling params
+
+            Returns:
+            `Callable` request logits processor, or None
+            """
+            target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+                "target_token"
+            )
+            if target_token is None:
+                return None
+            if not isinstance(target_token, int):
+                logger.warning(
+                    "target_token value %s is not int; not applying logits"
+                    " processor to request.",
+                    target_token,
+                )
+                return None
+            return DummyPerReqLogitsProcessor(target_token)
+    ```
+
+!!! note
+    Your `new_req_logits_processor()` override can return `None` to signal that the wrapped logits processor should not be applied to the request in question.
+
+Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request level logits processor, you can pass the custom subclass to vLLM via any of the methods described in the following section.
+
+## Ways to Load Your Custom Logits Processor in vLLM
+
+Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
+
+This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
+
+### Method 1: Pass the Custom Logits Processor Fully-Qualified Class Name (FQCN) to vLLM at Initialization Time
+
+This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` and `AsyncLLM` Python constructors, or as a CLI argument to `vllm serve` with the following syntax
+
+``` bash
+vllm serve ... --logits_processors <logits processor 1> <logits processor 2> ...
+```
+
+The only requirements on the FQCN are
+
+1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module
+
+2. The class named by the class-name portion of the FQCN must be importable from the loaded module
+
+3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor`
+
+See examples below:
+
+??? code "Passing custom logits processor FQCN to `LLM` in Python"
+
+    ``` python
+    # Pass in FQCN
+    llm = LLM(
+        model="facebook/opt-125m",
+        logits_processors=["your.module.path:DummyLogitsProcessor"],
+    )
+    ```
+
+??? code "Passing custom logits processor FQCN to `AsyncLLM` in Python"
+
+    ``` python
+    # Pass in FQCN
+    engine_args = AsyncEngineArgs(model="facebook/opt-125m",
+                                  logits_processors=["your.module.path:DummyLogitsProcessor"])
+    async_llm = AsyncLLM.from_engine_args(engine_args)
+    ```
+
+??? code "Passing custom logits processor FQCN to vLLM server via CLI"
+
+    ```bash
+    vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor
+    ```
+
+### Method 2: Automatically Detect Custom Logits Processors Installed in Your Python Environment As Entry Points
+
+[`setuptools`](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) can enable installed packages to make themselves available as plugins to other Python programs, via pieces of metadata known as "entry points".
+
+During initialization, vLLM automatically scans the `vllm.logits_processors` entry point group and loads any installed logits processors which it finds.
+
+Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your logits processor Python package. The example below shows how to add an entrypoint to your project's `pyproject.toml` file:
+
+??? code "Exposing a custom logits processor as a Python entrypoint"
+
+    ``` toml
+    [project.entry-points."vllm.logits_processors"]
+    dummy_logits_processor = "your.module.path:DummyLogitsProcessor"
+    ```
+
+Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` or `AsyncLLM` constructors or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point.
+
+!!! note
+    vLLM will *always* load *all* logits processors which are exposed via entrypoints under the `vllm.logits_processors` grouping.
+
+### Method 3 (Offline-only): Pass a Python Class Object to the vLLM Constructor
+
+You can pass one or more custom logits processor class objects to the `LLM` and `AsyncLLM` constructors. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` or `AsyncLLM` is instantiated, or (2) imported from a Python package.
+
+??? code "Passing custom logits processor class object to `LLM` or `AsyncLLM` in Python"
+
+    ``` python
+    # Import custom logits processor
+    from some.module import DummyLogitsProcessor
+
+    # ...or...
+
+    # Define custom logits processor locally
+    from vllm.v1.sample.logits_processor import LogitsProcessor
+
+    class DummyLogitsProcessor(LogitsProcessor):
+        # See DummyLogitsProcessor implementation above
+        ...
+
+    # Pass class object to LLM constructor
+    llm = LLM(
+        model="facebook/opt-125m",
+        logits_processors=[DummyLogitsProcessor],
+    )
+
+    # Pass class object to AsyncLLM constructor
+    engine_args = AsyncEngineArgs(model="facebook/opt-125m",
+                                  logits_processors=[DummyLogitsProcessor])
+    async_llm = AsyncLLM.from_engine_args(engine_args)
+    ```
+
+## Invoking a Custom Logits Processor Against a Request
+
+The design of the custom logits processor determines whether the logits processor must be enabled/disabled for a given request, and what arguments must be provided to configure the logits processor.
+
+The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior.
+
+??? code "vLLM REST API: configure custom logits processor for a request"
+
+    ``` bash
+    curl http://localhost:8000/v1/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+            "model": "Qwen/Qwen2.5-1.5B-Instruct",
+            ...
+            "vllm_xargs": {"target_token": 67}
+        }'
+    ```
+
+??? code "OpenAI SDK: configure custom logits processor for a request"
+
+    ``` python
+    batch = await client.completions.create(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        ...,
+        extra_body={
+            "vllm_xargs": {
+                "target_token": 67
+            }
+        }
+    )
+    ```
+
+??? code "Offline: configure custom logits processor for an `LLM` request"
+
+    ``` python
+    outputs_logitproc = llm.generate("your prompt", 
+                                     SamplingParams(...,
+                                        extra_args={"target_token": 67}))
+    ```
+
+??? code "Offline: configure custom logits processor for an `AsyncLLM` request"
+
+    ``` python
+    async for out in engine.generate(request_id="your request id",
+                                     prompt="your prompt",
+                                     sampling_params=SamplingParams(...,
+                                        extra_args={"target_token": 67})):
+
+        # Process async request outputs
+        ...
+    ```
+
+## Best Practices for Writing Custom Logits Processors
+
+Once vLLM loads a logits processor during initialization, it invokes `update_state()` and `apply()` on that logits processor in every engine step. Both methods operate on all requests that currently reside in the vLLM persistent batch, so it is important to implement them efficiently.
+
+* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
+    * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
+    * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state, i.e. the class can represent request configuration using a dictionary that only stores metadata about requests that enable the logits processor
+    * **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity
+
+* It is up to the logits processor author to determine:
+
+    1. **The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state
+
+        * **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance.
+
+    2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, e.g. by defaulting an argument to `None` or by passing in a specific do-nothing argument value such as `0.0`. Try to save compute and memory for requests which disable the logits processor
+
+        * **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request
+
+    3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings, e.g. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
+
+        * Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None`
+
+        * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default
+
+* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
+
+    * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
+
+* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (e.g. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t
        print(o.outputs[0].text)
    ```

+Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache.
+
+??? code
+
+    ```python
+    from vllm import LLM
+    from PIL import Image
+
+    # Qwen2.5-VL example with two images
+    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
+
+    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
+    img_b = Image.open("/path/to/b.jpg")
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {"image": [None, img_b]},
+        # Since img_a is expected to be cached, we can skip sending the actual
+        # image entirely.
+        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
+    })
+
+    for o in outputs:
+        print(o.outputs[0].text)
+    ```
+
 !!! warning
    If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.

@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
    )
    ```

+For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this:
+
+    ```python
+        # Image/video/audio URL:
+        {
+            "type": "image_url",
+            "image_url": None,
+            "uuid": image_uuid,
+        },
+
+        # image_embeds
+        {
+            "type": "image_embeds",
+            "image_embeds": None,
+            "uuid": image_uuid
+        },
+
+        # input_audio:
+        {
+            "type": "input_audio",
+            "input_audio": None,
+            "uuid": audio_uuid
+        },
+
+        # PIL Image:
+        {
+            "type": "image_pil",
+            "image_pil": None,
+            "uuid": image_uuid
+        }
+
+    ```
+
 !!! note
    Only one message can contain `{"type": "image_embeds"}`.
    If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@ -43,19 +43,19 @@ th:not(:first-child) {
 }
 </style>

-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | AWS Neuron   | Google TPU   |
-|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
-| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
-| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
-| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎          | ✅︎           |
-| FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ✅︎          | ❌           |
-| BitBLAS               | ✅︎      | ✅       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| BitBLAS (GPTQ)        | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INC (W8A8)            | ❌      | ❌       | ❌       | ❌    | ❌       | ❌         | ❌          | ✅︎         | ❌        | ❌          | ❌           |
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | Google TPU   |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|
+| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌           |
+| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌           |
+| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎           |
+| FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌           |
+| BitBLAS               | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| BitBLAS (GPTQ)        | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌           |
+| INC (W8A8)            | ❌      | ❌       | ❌       | ❌    | ❌       | ❌         | ❌          | ✅︎         | ❌        | ❌           |

 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - ✅︎ indicates that the quantization method is supported on the specified hardware.
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models:
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
 | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
+| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |

 !!! note
    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@ -311,6 +311,15 @@ Flags:
 * For non-reasoning: `--tool-call-parser hunyuan_a13b`
 * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`

+### GLM-4.5 Models (`glm45`)
+
+Supported models:
+
+* `zai-org/GLM-4.5`
+* `zai-org/GLM-4.5-Air`
+
+Flags: `--tool-call-parser glm45`
+
 ### Models with Pythonic Tool Calls (`pythonic`)

 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
--- a/docs/getting_started/installation/.nav.yml
+++ b/docs/getting_started/installation/.nav.yml
@ -3,5 +3,3 @@ nav:
  - gpu.md
  - cpu.md
  - google_tpu.md
-  - intel_gaudi.md
-  - aws_neuron.md
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@ -12,7 +12,6 @@ vLLM supports the following hardware platforms:
    - [Apple silicon](cpu.md#apple-silicon)
    - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
 - [Google TPU](google_tpu.md)
- [AWS Neuron](aws_neuron.md)

 ## Hardware Plugins

--- a/docs/getting_started/installation/aws_neuron.md
+++ b/docs/getting_started/installation/aws_neuron.md
@ -1,147 +0,0 @@
-# AWS Neuron
-
-[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and
-generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2,
-and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores.
-This describes how to set up your environment to run vLLM on Neuron.
-
-!!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
-
-## Requirements
-
- OS: Linux
- Python: 3.9 or newer
- Pytorch 2.5/2.6
- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips)
- AWS Neuron SDK 2.23
-
-## Configure a new environment
-
-### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies
-
-The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this
-[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).
-
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
- Once inside your instance, activate the pre-installed virtual environment for inference by running
-
-```bash
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
-```
-
-Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html)
-for alternative setup instructions including using Docker and manually installing dependencies.
-
-!!! note
-    NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx)
-    library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built Neuron wheels.
-
-### Build wheel from source
-
-To build and install vLLM from source, run:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -U -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install -e .
-```
-
-AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at
-<https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2>, which contains several features in addition to what's
-available on vLLM V0. Please utilize the AWS Fork for the following features:
-
- Llama-3.2 multi-modal support
- Multi-node distributed inference
-
-Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html)
-    for more details and usage examples.
-
-To install the AWS Neuron fork, run the following:
-
-```bash
-git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
-cd upstreaming-to-vllm
-pip install -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install -e .
-```
-
-Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested.
-
-## Set up using Docker
-
-### Pre-built images
-
-Currently, there are no pre-built Neuron images.
-
-### Build image from source
-
-See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
-
-Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
-
-## Extra information
-
-[](){ #feature-support-through-nxd-inference-backend }
-
-### Feature support through NxD Inference backend
-
-The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend
-to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most
-[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration.
-
-To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
-as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include
-
-```python
-override_neuron_config={
-    "enable_bucketing":False,
-}
-```
-
-or when launching vLLM from the CLI, pass
-
-```bash
--override-neuron-config "{\"enable_bucketing\":false}"
-```
-
-Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts
-(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads.
-
-### Known limitations
-
- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
-  [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
-  for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this
-  [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html)
-  to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at
-  runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py)
- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
-  to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
-  to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
-  to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches
-  max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt
-  to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support
-  for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is
-  implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.
-
-### Environment variables
-
- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
-  compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
-  artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set,
-  but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts
-  under this specified path.
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
--- a/docs/getting_started/installation/cpu/apple.inc.md
+++ b/docs/getting_started/installation/cpu/apple.inc.md
@ -52,6 +52,24 @@ uv pip install -e .
        1 error generated.
    ```

+    ---
+
+    If the build fails with C++11/C++17 compatibility errors like the following, the issue is that the build system is defaulting to an older C++ standard:
+
+    ```text
+    [...] error: 'constexpr' is not a type
+    [...] error: expected ';' before 'constexpr'
+    [...] error: 'constexpr' does not name a type
+    ```
+
+    **Solution**: Your compiler might be using an older C++ standard. Edit `cmake/cpu_extension.cmake` and add `set(CMAKE_CXX_STANDARD 17)` before `set(CMAKE_CXX_STANDARD_REQUIRED ON)`.
+
+    To check your compiler's C++ standard support:
+    ```bash
+    clang++ -std=c++17 -pedantic -dM -E -x c++ /dev/null | grep __cplusplus
+    ```
+    On Apple Clang 16 you should see: `#define __cplusplus 201703L`
+
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@ -168,6 +168,7 @@ There are scenarios where the PyTorch dependency cannot be easily installed with
 To build vLLM using an existing PyTorch installation:

 ```bash
+# install PyTorch first, either from PyPI or from source
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
 python use_existing_torch.py
@ -175,6 +176,17 @@ uv pip install -r requirements/build.txt
 uv pip install --no-build-isolation -e .
 ```

+Alternatively: if you are exclusively using `uv` to create and manage virtual environments, it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation)
+for disabling build isolation for specific packages. vLLM can leverage this mechanism to specify `torch` as the package to disable build isolation for:
+
+```bash
+# install PyTorch first, either from PyPI or from source
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+# pip install -e . does not work directly, only uv can do this
+uv pip install -e .
+```
+
 ##### Use the local cutlass for compilation

 Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@ -1,6 +1,6 @@
 # --8<-- [start:installation]

-vLLM supports AMD GPUs with ROCm 6.3.
+vLLM supports AMD GPUs with ROCm 6.3 or above.

 !!! tip
    [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
@ -11,8 +11,9 @@ vLLM supports AMD GPUs with ROCm 6.3.
 # --8<-- [end:installation]
 # --8<-- [start:requirements]

- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
- ROCm 6.3
+- GPU: MI200s (gfx90a), MI300 (gfx942), MI350 (gfx950), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
+- ROCm 6.3 or above
+    - MI350 requires ROCm 7.0 or above

 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@ -32,35 +33,35 @@ Currently, there are no pre-built ROCm wheels.
    - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
    - [PyTorch](https://pytorch.org/)

-    For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3.
+    For installing PyTorch, you can start from a fresh Docker image, e.g., `rocm/pytorch:rocm6.4.3_ubuntu24.04_py3.12_pytorch_release_2.6.0` or `rocm/pytorch-nightly`. If you are using a Docker image, you can skip to Step 3.

    Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example:

    ```bash
    # Install PyTorch
    pip uninstall torch -y
-    pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
+    pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
    ```

-1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
+1. Install [Triton for ROCm](https://github.com/triton-lang/triton)

-    Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+    Install ROCm's Triton (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)

    ```bash
    python3 -m pip install ninja cmake wheel pybind11
    pip uninstall -y triton
-    git clone https://github.com/OpenAI/triton.git
+    git clone https://github.com/triton-lang/triton.git
    cd triton
    git checkout e5be006
-    cd python
-    pip3 install .
+    if [ ! -f setup.py ]; then cd python; fi
+    python3 setup.py install
    cd ../..
    ```

    !!! note
        If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.

-2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention)
+2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/Dao-AILab/flash-attention)

    Install ROCm's flash attention (v2.7.2) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention#amd-rocm-support)
    Alternatively, wheels intended for vLLM use can be accessed under the releases.
@ -68,9 +69,9 @@ Currently, there are no pre-built ROCm wheels.
    For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.

    ```bash
-    git clone https://github.com/ROCm/flash-attention.git
+    git clone https://github.com/Dao-AILab/flash-attention.git
    cd flash-attention
-    git checkout b7d29fb
+    git checkout 1a7f4dfa
    git submodule update --init
    GPU_ARCHS="gfx90a" python3 setup.py install
    cd ..
@ -194,16 +195,6 @@ To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
 DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
 ```

-To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
-
-```bash
-DOCKER_BUILDKIT=1 docker build \
-    --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \
-    -f docker/Dockerfile.rocm \
-    -t vllm-rocm \
-    .
-```
-
 To run the above docker image `vllm-rocm`, use the below command:

 ??? console "Command"
@ -218,8 +209,7 @@ To run the above docker image `vllm-rocm`, use the below command:
    --device /dev/kfd \
    --device /dev/dri \
    -v <path/to/model>:/app/model \
-    vllm-rocm \
-    bash
+    vllm-rocm
    ```

 Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@ -114,6 +114,33 @@ class Example:
                return match.group('title')
        return fix_case(self.path.stem.replace("_", " ").title())

+    def fix_relative_links(self, content: str) -> str:
+        """
+        Fix relative links in markdown content by converting them to gh-file
+        format.
+        
+        Args:
+            content (str): The markdown content to process
+            
+        Returns:
+            str: Content with relative links converted to gh-file format
+        """
+        # Regex to match markdown links [text](relative_path)
+        # This matches links that don't start with http, https, ftp, or #
+        link_pattern = r'\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)'
+
+        def replace_link(match):
+            link_text = match.group(1)
+            relative_path = match.group(2)
+
+            # Make relative to repo root
+            gh_file = (self.main_file.parent / relative_path).resolve()
+            gh_file = gh_file.relative_to(ROOT_DIR)
+
+            return f'[{link_text}](gh-file:{gh_file})'
+
+        return re.sub(link_pattern, replace_link, content)
+
    def generate(self) -> str:
        content = f"# {self.title}\n\n"
        content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
@ -121,14 +148,16 @@ class Example:
        # Use long code fence to avoid issues with
        # included files containing code fences too
        code_fence = "``````"
-        # Skip the title from md snippets as it's been included above
-        start_line = 2
+
        if self.is_code:
-            content += f"{code_fence}{self.main_file.suffix[1:]}\n"
-            start_line = 1
-        content += f'--8<-- "{self.main_file}:{start_line}"\n'
-        if self.is_code:
-            content += f"{code_fence}\n"
+            content += (f"{code_fence}{self.main_file.suffix[1:]}\n"
+                        f'--8<-- "{self.main_file}"\n'
+                        f"{code_fence}\n")
+        else:
+            with open(self.main_file) as f:
+                # Skip the title from md snippets as it's been included above
+                main_content = f.readlines()[1:]
+            content += self.fix_relative_links("".join(main_content))
        content += "\n"

        if not self.other_files:
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@ -228,7 +228,7 @@ outputs = llm.embed(["Follow the white rabbit."],
 print(outputs[0].outputs)
 ```

-A code example can be found here: <gh-file:examples/offline_inference/embed_matryoshka_fy.py>
+A code example can be found here: <gh-file:examples/offline_inference/pooling/embed_matryoshka_fy.py>

 ### Online Inference

@ -258,4 +258,4 @@ Expected output:
 {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
 ```

-An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
+An OpenAI client example can be found here: <gh-file:examples/online_serving/pooling/openai_embedding_matryoshka_fy.py>
--- a/Show More
+++ b/Show More