Compare commits

..

50 Commits

Author SHA1 Message Date
220d694080 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 01:00:20 +00:00
70e06dd574 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 00:46:46 +00:00
7954461d4c updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 23:03:42 +00:00
a10da86677 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:56:53 +00:00
284d5df45b added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:50:20 +00:00
d5b0db449e added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:44:36 +00:00
66349c33a1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:36:57 +00:00
28d0396ff1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:54:04 +00:00
2f29ae383a added files
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:45:01 +00:00
cf64b0e6a7 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:44:14 +00:00
f51f182d64 pre-commit
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 20:18:50 +00:00
79e465f557 fix pre-commit
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-23 09:38:55 -04:00
2ba687d39f updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:52:06 -04:00
5d57896e2c updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:53 -04:00
f6f008ca1d cleanup
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:21 -04:00
24cbbe4778 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:50:48 -04:00
2fec6e0b5c working?
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:45:00 -04:00
47a3f26b2a updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:36:52 -04:00
144162fc8c Merge branch 'main' into rob-fixes
2025-03-22 17:12:53 -04:00
522279ebb9 Stash
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:12:21 -04:00
85687b43e7 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:00:46 -04:00
120bbdfd82 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 15:58:51 -04:00
2ceb7bc534 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:25:05 -04:00
9f7fb5ec84 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:22:00 -04:00
a8a621e419 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:11:50 -04:00
b89d89f456 fix rebase
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:32:21 +08:00
8355358fb3 add unlimited HWM
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
c0b1443345 fix mypy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
d35dace985 refactor zmq msg to object
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
912031ceb5 refactor disagg
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
4f13e89143 fix SIM105
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
b9a7dbe769 remove default socket address value
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
0cb2e05256 change log level and fix some comments
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
d6945ecdf0 change disagg_prefill example to use zmq
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
298298f97d remove invalid zmq benchmark code
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
6c8fae82dd run format
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
16ed827378 add benchmark shell
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:08 +08:00
8fa9df7987 run format.sh
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
27c1afe88b fix ThreadProxy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
ee6607332e create proxy sockets in the proxy function for thread safety
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
7fbf70db57 1. replace tcp:// with ipc://
2. fix json response
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
2c31e4c3ea Run yapf and ruff
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
187f112ccd 1. fix mypy issue
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
897db7b93d Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destroy(linger=0) for immediate termination
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
b7ffb43792 update disagg_connect test_request.py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
6e1fba8a73 1. connect_parser: make --prefill-addr and --decode-addr required
2. To more accurately reflect its purpose, rename connect.py to disagg_connector.py.

Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
bfde1688e7 add /v1/completions stream support
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
905424ed65 add identity url headers
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
5d20f389d6 add vllm connect cmd
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
2a0cb78016 add test py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
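
Several commit messages above name specific techniques; the sketches below are editorial illustrations of those techniques, not code taken from this PR.

"add unlimited HWM" (8355358fb3): ZeroMQ treats a high-water mark of 0 as "no limit", so queued messages are buffered in memory instead of being dropped or blocking the sender at a fixed threshold. A minimal pyzmq sketch, with a placeholder socket type and endpoint:

```python
import zmq

ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)  # illustrative socket type

# A HWM of 0 means "unlimited" in ZeroMQ: messages queue in memory
# rather than blocking or being dropped at a fixed threshold.
sock.setsockopt(zmq.SNDHWM, 0)
sock.setsockopt(zmq.RCVHWM, 0)

sock.connect("ipc:///tmp/disagg_example")  # placeholder endpoint
```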
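
"refactor zmq msg to object" (d35dace985): one common way to send a structured message over ZeroMQ is to serialize a small Python object into a single frame; the dataclass and its fields below are hypothetical, chosen only to show the pattern.

```python
import pickle
from dataclasses import dataclass

import zmq

@dataclass
class ProxyMessage:
    # Hypothetical fields; the real message schema is defined in the PR.
    request_id: str
    payload: dict

def send_message(sock: zmq.Socket, msg: ProxyMessage) -> None:
    # One serialized object per frame instead of hand-packed byte parts.
    sock.send(pickle.dumps(msg))

def recv_message(sock: zmq.Socket) -> ProxyMessage:
    return pickle.loads(sock.recv())
```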
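
"fix SIM105" (4f13e89143): SIM105 is the Ruff/flake8-simplify rule that flags a try/except/pass block which can be replaced with contextlib.suppress. The generic before/after pattern (the file path is just an example):

```python
import contextlib
import os

# Before (flagged by SIM105):
try:
    os.remove("/tmp/example.sock")
except FileNotFoundError:
    pass

# After (what the rule suggests):
with contextlib.suppress(FileNotFoundError):
    os.remove("/tmp/example.sock")
```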
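
"create proxy sockets in the proxy function for thread safety" (ee6607332e): ZeroMQ contexts can be shared across threads but sockets cannot, so sockets should be created inside the function the proxy thread runs. A sketch of that pattern with placeholder addresses and socket types:

```python
import threading

import zmq

ctx = zmq.Context()  # safe to share across threads; individual sockets are not

def run_proxy(frontend_addr: str, backend_addr: str) -> None:
    # Sockets are created here, inside the thread that uses them.
    frontend = ctx.socket(zmq.ROUTER)
    backend = ctx.socket(zmq.DEALER)
    frontend.bind(frontend_addr)
    backend.bind(backend_addr)
    try:
        zmq.proxy(frontend, backend)  # blocks, shuttling messages both ways
    finally:
        frontend.close(0)
        backend.close(0)

threading.Thread(
    target=run_proxy,
    args=("ipc:///tmp/frontend_example", "ipc:///tmp/backend_example"),
    daemon=True,
).start()
```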
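
"Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destroy(linger=0) for immediate termination" (897db7b93d): term() waits for every socket to be closed and for pending messages according to each socket's LINGER setting, which can hang shutdown; destroy(linger=0) force-closes all sockets with linger 0 and then terminates. In brief:

```python
import zmq.asyncio

ctx = zmq.asyncio.Context()
# ... sockets created from ctx are used by the connector ...

# ctx.term() blocks until all sockets are closed and pending messages are
# flushed per each socket's LINGER value; destroy(linger=0) closes every
# socket with linger=0 and terminates the context immediately.
ctx.destroy(linger=0)
```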
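
"connect_parser set --prefill-addr and --decode-addr are required" (6e1fba8a73): with argparse this is just required=True on both options; the parser below is a stand-in, not the PR's actual CLI wiring.

```python
import argparse

connect_parser = argparse.ArgumentParser(prog="vllm connect")  # illustrative name
connect_parser.add_argument("--prefill-addr", required=True,
                            help="address of the prefill instance")
connect_parser.add_argument("--decode-addr", required=True,
                            help="address of the decode instance")

args = connect_parser.parse_args()  # errors out if either flag is missing
```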
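
"add /v1/completions stream support" (bfde1688e7): a connector sitting in front of prefill/decode instances has to relay streaming responses chunk by chunk rather than buffering them. A hedged sketch using FastAPI and aiohttp; the framework, route, and upstream address are assumptions, not necessarily what the PR uses.

```python
import aiohttp
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

app = FastAPI()
DECODE_URL = "http://localhost:8200"  # placeholder decode-instance address

@app.post("/v1/completions")
async def proxy_completions(request: Request):
    payload = await request.json()

    async def relay():
        # Forward the request and yield upstream bytes as they arrive, so
        # '"stream": true' requests reach the client incrementally.
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{DECODE_URL}/v1/completions",
                                    json=payload) as resp:
                async for chunk in resp.content.iter_any():
                    yield chunk

    return StreamingResponse(relay(), media_type="text/event-stream")
```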
1045 changed files with 25686 additions and 79000 deletions

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

View File

@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.30
- name: "exact_match,flexible-extract"
value: 0.465
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:

View File

@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -16,7 +16,7 @@ import numpy
import pytest
import yaml
RTOL = 0.08
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

View File

@ -10,24 +10,15 @@ set -x
set -o pipefail
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
fi
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
fi
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
echo "GPU type is $gpu_type"
}
@ -99,15 +90,9 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
elif command -v amd-smi; then
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
fi
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
# remove vllm config file
rm -rf ~/.config/vllm

View File

@ -63,12 +63,10 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"draft_tensor_parallel_size": 1
}
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -3,10 +3,10 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
@ -14,10 +14,10 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
@ -31,10 +31,10 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
@ -57,7 +57,7 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
@ -82,22 +82,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build Neuron release image"
key: block-neuron-release-image-build
depends_on: ~
- label: "Build and publish Neuron release image"
depends_on: block-neuron-release-image-build
agents:
queue: neuron-postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -98,13 +98,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_cutlass_moe.py \
--ignore=kernels/test_mamba_ssm_ssd.py \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_block_int8.py \
--ignore=kernels/test_fused_quant_layernorm.py \
--ignore=kernels/test_int8_kernel.py \
--ignore=kernels/test_triton_moe_ptpc_fp8.py \
--ignore=kernels/test_permute_cols.py"
fi
@ -112,33 +105,19 @@ fi
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
@ -155,10 +134,9 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
@ -185,10 +163,9 @@ if [[ $commands == *"--shard-id="* ]]; then
fi
done
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \

View File

@ -5,8 +5,8 @@
set -ex
set -o pipefail
# cd 2 levels into the working directory
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

View File

@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x .
docker build -t cpu-test -f Dockerfile.ppc64le .

View File

@ -8,19 +8,15 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,8 +36,8 @@ function cpu_tests() {
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model

View File

@ -9,13 +9,11 @@ python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@ -5,7 +5,7 @@
set -ex
# Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup
# certain versions of HPU software stack have a bug that can

View File

@ -3,7 +3,7 @@
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

View File

@ -35,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() {

View File

@ -1,9 +1,9 @@
#!/bin/bash
set -xue
set -e
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
@ -17,17 +17,12 @@ source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& echo TEST_0 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
&& echo TEST_1 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
@ -35,20 +30,9 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
&& echo TEST_6 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
&& echo TEST_7 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
&& echo TEST_8 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
&& echo TEST_9 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
&& echo TEST_10 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
&& echo TEST_11 \
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {

View File

@ -1,45 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman rm -f "$container_id" || true
fi
podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests

View File

@ -8,7 +8,6 @@
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
@ -71,7 +70,6 @@ steps:
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
@ -106,8 +104,7 @@ steps:
- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
@ -121,7 +118,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -138,14 +135,12 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@ -158,7 +153,6 @@ steps:
- popd
- label: Metrics, Tracing Test # 10min
mirror_hardwares: [amd]
num_gpus: 2
source_file_dependencies:
- vllm/
@ -166,13 +160,18 @@ steps:
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/test_regression
@ -203,13 +202,12 @@ steps:
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/sample
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_stats.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
@ -285,22 +283,13 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4
- label: PyTorch Compilation Unit Tests
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- label: PyTorch Fullgraph Smoke Test # 9min
source_file_dependencies:
- vllm/
@ -310,6 +299,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min
source_file_dependencies:
@ -318,49 +308,18 @@ steps:
commands:
- pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test
- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- tests/kernels/core
commands:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
- tests/kernels/attention
- tests/kernels
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Quantization Test %N
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test
source_file_dependencies:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
commands:
- pytest -v -s kernels/moe
- label: Kernels Mamba Test
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
commands:
- pytest -v -s kernels/mamba
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Tensorizer Test # 11min
# mirror_hardwares: [amd]
mirror_hardwares: [amd]
soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
@ -376,14 +335,7 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test # 10min
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- bash run-benchmarks.sh
- label: Quantization Test # 33min
source_file_dependencies:
@ -418,14 +370,12 @@ steps:
- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
#mirror_hardwares: [ amd ]
mirror_hardwares: [ amd ]
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/mistral_tool_use
commands:
- pytest -v -s tool_use
- pytest -v -s mistral_tool_use
##### models test #####
@ -437,9 +387,7 @@ steps:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
@ -449,8 +397,6 @@ steps:
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
@ -462,8 +408,6 @@ steps:
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
@ -480,12 +424,11 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
@ -499,7 +442,10 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
@ -515,7 +461,6 @@ steps:
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
mirror_hardwares: [amd]
optional: true
commands:
- echo 'Testing custom models...'
@ -527,7 +472,6 @@ steps:
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@ -570,10 +514,7 @@ steps:
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
@ -584,14 +525,11 @@ steps:
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
@ -654,10 +592,14 @@ steps:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min

.github/CODEOWNERS (vendored, 1 line changed)
View File

@ -12,7 +12,6 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
CMakeLists.txt @tlrmchlsmth
# vLLM V1

View File

@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

View File

@ -35,7 +35,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

.github/mergify.yml (vendored, 66 lines changed)
View File

@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^docker/Dockerfile
- files~=^Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
@ -55,19 +55,11 @@ pull_request_rules:
description: Automatically apply structured-output label
conditions:
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/source/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
- files=benchmarks/benchmark_serving_guided.py
- files=benchmarks/benchmark_guided.py
actions:
label:
add:
@ -96,58 +88,6 @@ pull_request_rules:
add:
- v1
- name: label-tpu
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu
- name: label-tpu-remove
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/source/features/tool_calling.md
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- files=docs/source/getting_started/examples/chat_with_tools.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict

View File

@ -50,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster
run: |

.gitignore (vendored, 5 lines changed)
View File

@ -2,7 +2,7 @@
/vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
vllm/vllm_flash_attn/
# Byte-compiled / optimized / DLL files
__pycache__/
@ -202,6 +202,3 @@ benchmarks/**/*.json
# Linting
actionlint
shellcheck*/
# Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
- manual # Run in CI
@ -11,6 +8,7 @@ repos:
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3
hooks:
@ -121,12 +119,6 @@ repos:
language: system
always_run: true
pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
language: script
files: ^docker/Dockerfile$
pass_filenames: false
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@ -230,12 +230,10 @@ set(VLLM_EXT_SRC
"csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
@ -243,7 +241,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -251,7 +248,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -269,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.9.0
GIT_TAG v3.8.0
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -285,13 +282,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/attention/mla/cutlass_mla_entry.cu")
"csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -464,52 +461,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(FP4_ARCHS)
endif()
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
# Add MLA-specific include directories only to MLA source files
set_source_files_properties(${SRCS}
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
else()
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
# clear MLA_ARCHS
set(MLA_ARCHS)
endif()
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper.")
else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels
@ -629,51 +580,21 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
#
# For the Marlin MOE kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
)
if (NOT moe_marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin MOE generation failed."
" Result: \"${moe_marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
else()
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
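The block above re-runs Marlin MoE kernel generation only when the MD5 of `generate_kernels.py` differs from the hash stored in the CMake cache. For readers less familiar with CMake caching, here is a minimal Python sketch of the same hash-gating idea; the stamp-file path and helper name are illustrative, not part of the build:

```python
# Sketch of the hash-gated generation pattern used above: run an expensive
# codegen step only when the generator script itself has changed.
import hashlib
import pathlib
import subprocess


def generate_if_changed(script: pathlib.Path, stamp: pathlib.Path) -> None:
    current = hashlib.md5(script.read_bytes()).hexdigest()
    previous = stamp.read_text().strip() if stamp.exists() else None
    if previous == current:
        print("Generation script has not changed, skipping generation.")
        return
    # Counterpart of the execute_process() call: run the generator and record
    # its hash only after it succeeds (mirrors the FATAL_ERROR branch above).
    subprocess.run(["python3", str(script)], check=True)
    stamp.write_text(current)
```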
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}"
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@ -698,7 +619,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
#
set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/skinny_gemms.cu"
"csrc/rocm/attention.cu")
define_gpu_extension_target(


@ -14,22 +14,17 @@ ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -51,22 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
fi
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt
uv pip install -r requirements/cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -91,7 +83,7 @@ COPY requirements/build.txt requirements/build.txt
ENV UV_HTTP_TIMEOUT=500
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
uv pip install -r requirements/build.txt
COPY . .
ARG GIT_REPO_CHECK=0
@ -162,11 +154,8 @@ ENV UV_HTTP_TIMEOUT=500
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
uv pip install -r requirements/dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
@ -182,23 +171,18 @@ ARG TARGETPLATFORM
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -216,14 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose
uv pip install dist/*.whl --verbose
# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
@ -238,13 +221,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
fi
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
# Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
@ -252,7 +232,7 @@ COPY ./vllm/collect_env.py .
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
uv pip install -r requirements/build.txt
#################### vLLM installation IMAGE ####################
@ -268,19 +248,16 @@ ADD . /vllm-workspace/
ENV UV_HTTP_TIMEOUT=500
# install development dependencies (for testing)
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
uv pip install -r requirements/dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
uv pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
@ -297,7 +274,6 @@ RUN mv vllm test_docs/
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -306,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image

Dockerfile.cpu Normal file

@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
COPY ./ /workspace/vllm


@ -1,6 +1,6 @@
# default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
FROM $BASE_IMAGE
@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict

Dockerfile.ppc64le Normal file

@ -0,0 +1,37 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements/cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -12,8 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
# Remove sccache
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
@ -41,7 +40,7 @@ ARG USE_CYTHON
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR


@ -1,18 +1,18 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="db8e93b4"
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="295f2ed4"
ARG PYTORCH_VISION_BRANCH="v0.21.0"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="7e1ed08"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
&& apt-get install -y software-properties-common git curl sudo vim less \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@ -42,7 +42,7 @@ RUN apt-get update -y \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
@ -60,8 +60,7 @@ RUN cd hipBLAS-common \
RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -111,24 +110,11 @@ RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
@ -144,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl
ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH
ARG RCCL_REPO
@ -161,8 +154,6 @@ ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -176,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt


@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd ../../python && \
export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \
uv pip install -r requirements-build.txt && \
uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
FROM python-install AS numa-build
@ -96,22 +96,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
python setup.py bdist_wheel
FROM python-install AS hf-xet-builder
# Install hf-xet
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/huggingface/xet-core.git && \
cd xet-core/hf_xet/ && \
uv pip install maturin patchelf && \
python -m maturin build --release --out dist && \
mkdir -p /tmp/hf-xet/dist && \
cp dist/*.whl /tmp/hf-xet/dist/
# Final build stage
FROM python-install AS vllm-cpu
ARG PYTHON_VERSION
@ -136,15 +120,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
uv pip install -v \
$ARROW_WHL_FILE \
$VISION_WHL_FILE \
$HF_XET_WHL_FILE \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
--index-strategy unsafe-best-match \
-r requirements/build.txt \
@ -168,5 +149,4 @@ USER 2000
WORKDIR /home/vllm
# Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]


@ -10,40 +10,28 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
<details>
<summary>Previous News</summary>
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
</details>
- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -98,7 +86,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing
We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors
@ -121,7 +109,6 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- Intel
- Lambda Lab
- Nebius
- Novita AI


@ -41,39 +41,29 @@ become available.
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
<td><strong>HuggingFace</strong></td>
<td style="text-align: center;">🟡</td>
<td style="text-align: center;">🟡</td>
<td>Specify your dataset path on HuggingFace</td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
</tbody>
</table>
✅: supported
🟡: Partial support
🚧: to be supported
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArena's `dataset-name` should be set to `hf`
---
## Example - Online Benchmark
@ -81,7 +71,8 @@ become available.
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
```
Then run the benchmarking script
@ -89,13 +80,12 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
@ -132,105 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}"
```
### InstructCoder Benchmark with Speculative Decoding
### HuggingFaceDataset Examples
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-model "[ngram]" \
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
formats, please consider contributing.
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-subset "${DATASET_SUBSET}"
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
```
---
## Example - Offline Throughput Benchmark
```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
--model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}"
```
If successful, you will see the following output
@ -244,13 +217,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models
``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```
The `num prompt tokens` now includes image token counts
@ -261,83 +240,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
--model "${MODEL_NAME}" \
--backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \
--dataset_name "${DATASET_NAME}" \
--num-prompts "${NUM_PROMPTS}" \
--max-loras "${MAX_LORAS}" \
--max-lora-rank "${MAX_LORA_RANK}" \
${ENABLE_LORA} \
--lora-path "${LORA_PATH}"
```


@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import io
import json
import os
import sys
@ -33,7 +32,6 @@ class RequestFuncInput:
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
@dataclass
@ -221,15 +219,7 @@ async def async_request_deepspeed_mii(
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.generated_text = parsed_resp["text"][0]
output.success = True
else:
output.error = response.reason or ""
@ -438,110 +428,6 @@ async def async_request_openai_chat_completions(
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(
("transcriptions", "translations"
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
@ -599,14 +485,7 @@ ASYNC_REQUEST_FUNCS = {
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
]
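The registry above is how the benchmark scripts pick a request function: the `--backend` string indexes into `ASYNC_REQUEST_FUNCS`, and the chosen coroutine is awaited once per request. A hypothetical usage sketch, reusing the module's registry and dataclasses (the wrapper name is illustrative):

```python
# Illustrative dispatch through the registry defined above; each request
# function returns a RequestFuncOutput with latency/TTFT/ITL statistics.
import asyncio


def run_one(backend: str, request_func_input):
    # Look up the coroutine registered for this --backend value and run it.
    try:
        request_func = ASYNC_REQUEST_FUNCS[backend]
    except KeyError as e:
        raise ValueError(f"Unknown backend: {backend}") from e
    return asyncio.run(request_func(request_func_input=request_func_input))
```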


@ -23,8 +23,7 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from functools import cache
from io import BytesIO
from typing import Any, Callable, Optional, Union
from typing import Any, Optional, Union
import numpy as np
import pandas as pd
@ -64,7 +63,6 @@ class SampleRequest:
class BenchmarkDataset(ABC):
DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__(
self,
@ -241,24 +239,21 @@ def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
For a PIL.Image.Image input:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
For a string input:
- Treats the string as a URL or file path.
- Prepends "file://" if the string doesn't start with "http://" or
"file://".
- Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
ValueError: If the input is neither a PIL.Image.Image nor a string.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
@ -277,8 +272,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
# -----------------------------------------------------------------------------
@ -289,7 +284,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
class RandomDataset(BenchmarkDataset):
# Default values copied from benchmark_serving.py for the random dataset.
DEFAULT_PREFIX_LEN = 0
DEFAULT_RANGE_RATIO = 0.0
DEFAULT_RANGE_RATIO = 1.0
DEFAULT_INPUT_LEN = 1024
DEFAULT_OUTPUT_LEN = 128
@ -309,32 +304,19 @@ class RandomDataset(BenchmarkDataset):
output_len: int = DEFAULT_OUTPUT_LEN,
**kwargs,
) -> list[SampleRequest]:
# Enforce range_ratio < 1
assert range_ratio < 1.0, (
"random_range_ratio must be < 1.0 to ensure a valid sampling range"
)
vocab_size = tokenizer.vocab_size
prefix_token_ids = (np.random.randint(
0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
# New sampling logic: [X * (1 - b), X * (1 + b)]
input_low = int(input_len * (1 - range_ratio))
input_high = int(input_len * (1 + range_ratio))
output_low = int(output_len * (1 - range_ratio))
output_high = int(output_len * (1 + range_ratio))
# Add logging for debugging
logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
logger.info("Sampling output_len from [%s, %s]", output_low,
output_high)
input_low = int(input_len * range_ratio)
output_low = int(output_len * range_ratio)
input_lens = np.random.randint(input_low,
input_high + 1,
input_len + 1,
size=num_requests)
output_lens = np.random.randint(output_low,
output_high + 1,
output_len + 1,
size=num_requests)
offsets = np.random.randint(0, vocab_size, size=num_requests)
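A small worked example of the new symmetric sampling window, next to the behaviour it replaces (numbers are illustrative):

```python
# New sampling window: [X * (1 - b), X * (1 + b)], with b = range_ratio < 1.0.
# The previous logic sampled from [X * b, X] instead.
import numpy as np

input_len, range_ratio = 1024, 0.25            # example values only
low = int(input_len * (1 - range_ratio))       # 768
high = int(input_len * (1 + range_ratio))      # 1280
lens = np.random.randint(low, high + 1, size=8)
assert lens.min() >= 768 and lens.max() <= 1280
```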
@ -486,11 +468,11 @@ class SonnetDataset(BenchmarkDataset):
# Determine how many poem lines to use.
num_input_lines = round((input_len - base_offset) / avg_len)
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
num_prefix_lines = round((prefix_len - base_offset) / avg_len)
prefix_lines = self.data[:num_prefix_lines]
samples = []
while len(samples) < num_requests:
for _ in range(num_requests):
extra_lines = random.choices(self.data,
k=num_input_lines - num_prefix_lines)
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
@ -498,14 +480,13 @@ class SonnetDataset(BenchmarkDataset):
prompt_formatted = tokenizer.apply_chat_template(
msg, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
if prompt_len <= input_len:
samples.append(
SampleRequest(
prompt=prompt_formatted
if return_prompt_formatted else prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
samples.append(
SampleRequest(
prompt=prompt_formatted
if return_prompt_formatted else prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
return samples
@ -581,48 +562,48 @@ class BurstGPTDataset(BenchmarkDataset):
# -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation
# HuggingFace Dataset Implementation
# -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
"""Base class for datasets hosted on HuggingFace."""
SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
class HuggingFaceDataset(BenchmarkDataset):
"""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
def __init__(
self,
dataset_path: str,
dataset_split: str,
dataset_subset: Optional[str] = None,
**kwargs,
) -> None:
super().__init__(dataset_path=dataset_path, **kwargs)
super().__init__(**kwargs)
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
self.load_data()
def load_data(self) -> None:
"""Load data from HuggingFace datasets."""
if not self.dataset_path:
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = self.data.shuffle(seed=self.random_seed)
# -----------------------------------------------------------------------------
# Conversation Dataset Implementation
# -----------------------------------------------------------------------------
class ConversationDataset(HuggingFaceDataset):
"""Dataset for conversation data with multimodal support."""
SUPPORTED_DATASET_PATHS = {
'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
}
IS_MULTIMODAL = True
if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
@ -630,13 +611,10 @@ class ConversationDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(
lambda x: len(x["conversations"]) >= 2)
sampled_requests = []
dynamic_output = output_len is None
for item in filtered_data:
for item in self.data:
if len(sampled_requests) >= num_requests:
break
conv = item["conversations"]
@ -681,13 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):
"""
DEFAULT_OUTPUT_LEN = 128
SUPPORTED_DATASET_PATHS = {
"lmarena-ai/VisionArena-Chat":
lambda x: x["conversation"][0][0]["content"],
"lmarena-ai/vision-arena-bench-v0.1":
lambda x: x["turns"][0][0]["content"]
}
IS_MULTIMODAL = True
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample(
self,
@ -703,11 +697,7 @@ class VisionArenaDataset(HuggingFaceDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
if parser_fn is None:
raise ValueError(
f"Unsupported dataset path: {self.dataset_path}")
prompt = parser_fn(item)
prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat:
@ -725,173 +715,3 @@ class VisionArenaDataset(HuggingFaceDataset):
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
"""
InstructCoder Dataset.
https://huggingface.co/datasets/likaixin/InstructCoder
InstructCoder is a dataset designed for general code editing. It consists
of 114,239 instruction-input-output triplets and covers multiple distinct
code editing scenarios.
"""
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
SUPPORTED_DATASET_PATHS = {
"likaixin/InstructCoder",
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
class AIMODataset(HuggingFaceDataset):
"""
Dataset class for processing an AIMO dataset with reasoning questions.
"""
SUPPORTED_DATASET_PATHS = {
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
"AI-MO/NuminaMath-CoT"
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt, completion = item['problem'], item["solution"]
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(prompt_len,
completion_len,
max_prompt_len=2048,
max_total_len=32000):
continue
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=None,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------
class ASRDataset(HuggingFaceDataset):
"""
Dataset class for processing an ASR dataset for transcription.
Tested on the following set:
+----------------+----------------------------------------+--------------------------+-----------------------------+
| Dataset | Domain | Speaking Style | hf-subset |
+----------------+----------------------------------------+--------------------------+-----------------------------+
| TED-LIUM | TED talks | Oratory | release1, release2, release3|
| | | | release3-speaker-adaptation |
| VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
| LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" |
| GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
| SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
| AMI | Meetings | Spontaneous | ihm, sdm |
+----------------+----------------------------------------+--------------------------+-----------------------------+
""" # noqa: E501
SUPPORTED_DATASET_PATHS = {
"openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
"edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
}
DEFAULT_OUTPUT_LEN = 128
IS_MULTIMODAL = True
# TODO Whisper-specific. Abstract interface when more models are supported.
TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
"<|notimestamps|>"
skip_long_audios: bool = True
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs,
) -> list:
import librosa
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests = []
skipped = 0
for item in self.data:
if len(sampled_requests) >= num_requests:
break
audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr)
# Whisper max supported duration
if self.skip_long_audios and duration_s > 30:
skipped += 1
continue
mm_content = {"audio": (y, sr)}
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
))
if skipped:
logger.warning("%d samples discarded from dataset due to" \
" their length being greater than" \
" what Whisper supports.", skipped)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
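The 30-second cutoff above is the main Whisper-specific constraint; a small self-contained sketch of that duration check, with a hypothetical 45-second silent clip standing in for a dataset row:

import librosa
import numpy as np

sr = 16000
y = np.zeros(45 * sr, dtype=np.float32)  # 45 s of silence at 16 kHz

duration_s = librosa.get_duration(y=y, sr=sr)
if duration_s > 30:
    # Mirrors skip_long_audios: Whisper consumes 30 s windows, so longer
    # clips are skipped rather than truncated.
    print(f"skipping clip ({duration_s:.1f} s > 30 s)")
else:
    mm_content = {"audio": (y, sr)}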

View File

@ -63,16 +63,14 @@ class Request:
output_len: int
def sample_tokens(tokenizer: PreTrainedTokenizerBase,
length: int) -> list[int]:
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
vocab = tokenizer.get_vocab()
all_special_ids = set(tokenizer.all_special_ids)
# Remove the special tokens.
return random.choices(
[v for k, v in vocab.items() if k not in all_special_ids],
k=length,
)
vocab = {
k: v
for k, v in vocab.items() if k not in tokenizer.all_special_ids
}
return random.choices(list(vocab.values()), k=length)
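As a rough usage sketch (assuming a HuggingFace tokenizer), the sampled token ids can be decoded into a synthetic prompt; note that both variants shown above return a list of ids, even though one annotates the return type as str:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative only

token_ids = sample_tokens(tokenizer, 32)        # 32 ids drawn from the vocab
synthetic_prompt = tokenizer.decode(token_ids)  # turn them back into text
print(len(token_ids), synthetic_prompt[:80])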
def sample_requests_from_dataset(

View File

@ -7,6 +7,9 @@ On the server side, run one of the following commands:
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
@ -34,8 +37,7 @@ from datetime import datetime
from typing import Any, Optional
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
@ -50,11 +52,9 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
ConversationDataset, HuggingFaceDataset,
InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -156,7 +156,7 @@ def calculate_metrics(
if outputs[i].success:
output_len = outputs[i].output_tokens
if not output_len:
if output_len is None:
# We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be
@ -261,7 +261,6 @@ async def benchmark(
goodput_config_dict: dict[str, float],
max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]],
extra_body: Optional[dict],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -274,6 +273,10 @@ async def benchmark(
input_requests[0].expected_output_len, \
input_requests[0].multi_modal_data
if backend != "openai-chat" and test_mm_content is not None:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' backend.")
assert test_mm_content is None or isinstance(test_mm_content, dict)
test_input = RequestFuncInput(
model=model_id,
@ -285,7 +288,6 @@ async def benchmark(
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
test_output = await request_func(request_func_input=test_input)
@ -312,8 +314,7 @@ async def benchmark(
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body)
ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@ -363,8 +364,7 @@ async def benchmark(
output_len=output_len,
logprobs=logprobs,
multi_modal_content=mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body)
ignore_eos=ignore_eos)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
@ -586,49 +586,19 @@ def main(args: argparse.Namespace):
return_prompt_formatted=True)
elif args.dataset_name == "hf":
# all following datasets are implemented from the
# HuggingFaceDataset base class
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_class = VisionArenaDataset
args.hf_split = "train"
args.hf_subset = None
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ConversationDataset
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_class = AIMODataset
args.hf_split = "train"
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ASRDataset
args.hf_split = "train"
else:
supported_datasets = set([
dataset_name for cls in HuggingFaceDataset.__subclasses__()
for dataset_name in cls.SUPPORTED_DATASET_PATHS
])
raise ValueError(
f"Unsupported dataset path: {args.dataset_path}. "
"Huggingface dataset only supports dataset_path"
f" from one of following: {supported_datasets}. "
"Please consider contributing if you would "
"like to add support for additional dataset formats.")
if (dataset_class.IS_MULTIMODAL and backend not in \
["openai-chat", "openai-audio"]):
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' and " \
"'openai-audio' backend.")
# Choose between VisionArenaDataset
# and HuggingFaceDataset based on provided parameters.
dataset_class = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
input_requests = dataset_class(
dataset_path=args.dataset_path,
dataset_subset=args.hf_subset,
dataset_split=args.hf_split,
random_seed=args.seed,
).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
random_seed=args.seed,
output_len=args.hf_output_len,
)
@ -663,26 +633,6 @@ def main(args: argparse.Namespace):
raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
goodput_config_dict = check_goodput_args(args)
# Collect the sampling parameters.
sampling_params = {
k: v
for k, v in {
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature
}.items() if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
@ -709,11 +659,10 @@ def main(args: argparse.Namespace):
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
))
# Save config and results to json
if args.save_result or args.append_result:
if args.save_result:
result_json: dict[str, Any] = {}
# Setup
@ -734,14 +683,6 @@ def main(args: argparse.Namespace):
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
if not args.save_detailed:
# Remove fields with too many data points
@ -752,6 +693,15 @@ def main(args: argparse.Namespace):
if field in result_json:
del result_json[field]
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
# Save to file
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@ -761,12 +711,7 @@ def main(args: argparse.Namespace):
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name,
mode="a+" if args.append_result else "w",
encoding='utf-8') as outfile:
# Append a newline.
if args.append_result and outfile.tell() != 0:
outfile.write("\n")
with open(file_name, "w", encoding='utf-8') as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
@ -898,11 +843,6 @@ if __name__ == "__main__":
help="When saving the results, whether to include per request "
"information such as response, error, ttfs, tpots, etc.",
)
parser.add_argument(
"--append-result",
action="store_true",
help="Append the benchmark result to the existing json file.",
)
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
@ -936,7 +876,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@ -944,7 +884,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
@ -1011,23 +951,18 @@ if __name__ == "__main__":
random_group.add_argument(
"--random-range-ratio",
type=float,
default=0.0,
help="Range ratio for sampling input/output length, "
"used only for random sampling. Must be in the range [0, 1) to define "
"a symmetric sampling range"
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
random_group.add_argument(
"--random-prefix-len",
type=int,
default=0,
help=("Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."),
)
help="Number of fixed prefix tokens before random "
" context. The length range of context in a random "
" request is [random-prefix-len, "
" random-prefix-len + random-prefix-len * random-range-ratio).")
hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
@ -1046,33 +981,6 @@ if __name__ == "__main__":
"from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
sampling_group.add_argument(
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).")
parser.add_argument(
'--tokenizer-mode',
type=str,

View File

@ -5,13 +5,16 @@ On the server side, run one of the following commands:
(vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \
--model <your_model> \
--dataset json \
--structured-output-ratio 1.0 \
--structured-output-backend auto \
--structured-output-backend xgrammar \
--request-rate 10 \
--num-prompts 1000
@ -51,7 +54,7 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from vllm.v1.structured_output.backend_xgrammar import (
from vllm.v1.structured_output.utils import (
has_xgrammar_unsupported_json_features)
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -130,11 +133,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
"description":
"An unique optional field to avoid cached schemas"
}
else:
json_schemas = [schema] * args.num_prompts
def gen_prompt(index: int):
return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
schema = json_schemas[index % len(json_schemas)]
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
def get_schema(index: int):
return json_schemas[index % len(json_schemas)]
@ -150,17 +152,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
elif args.dataset == "grammar":
schema = """
root ::= select_statement
?start: select_statement
select_statement ::= "SELECT " column " from " table " where " condition
?select_statement: "SELECT " column_list " FROM " table_name
column ::= "col_1 " | "col_2 "
?column_list: column_name ("," column_name)*
table ::= "table_1 " | "table_2 "
?table_name: identifier
condition ::= column "= " number
?column_name: identifier
number ::= "1 " | "2 "
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
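For intuition, the EBNF-style grammar above only derives statements over a fixed vocabulary, roughly of the form "SELECT col_1 from table_1 where col_1 = 1", while the Lark-style variant accepts arbitrary identifiers, e.g. "SELECT username,email FROM users"; the prompt just above is what the model sees, and the grammar is what the structured-output backend enforces.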
@ -964,7 +966,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@ -972,7 +974,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
@ -997,14 +999,12 @@ if __name__ == "__main__":
type=float,
default=1.0,
help="Ratio of Structured Outputs requests")
parser.add_argument("--structured-output-backend",
type=str,
choices=[
"outlines", "lm-format-enforcer", "xgrammar",
"guidance", "auto"
],
default="auto",
help="Backend to use for structured outputs")
parser.add_argument(
"--structured-output-backend",
type=str,
choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
default="xgrammar",
help="Backend to use for structured outputs")
args = parser.parse_args()
main(args)

View File

@ -11,8 +11,7 @@ from typing import Any, Optional, Union
import torch
import uvloop
from benchmark_dataset import (AIMODataset, BurstGPTDataset,
ConversationDataset, InstructCoderDataset,
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -213,17 +212,14 @@ def run_hf(
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt = requests[i].prompt
prompt_len = requests[i].prompt_len
output_len = requests[i].expected_output_len
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
@ -304,7 +300,6 @@ def get_requests(args, tokenizer):
"input_len": args.input_len,
"output_len": args.output_len,
}
if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
@ -322,23 +317,18 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs['dataset_split'] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
if args.backend != "vllm-chat":
raise ValueError(
"hf datasets only are supported by vllm-chat backend")
# Choose between VisionArenaDataset and HuggingFaceDataset based on
# provided parameters.
dataset_cls = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
@ -472,17 +462,9 @@ def validate_args(args):
warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
elif args.dataset_name == "hf":
if args.dataset_path in (
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501
elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501
else:
raise ValueError(
f"{args.dataset_path} is not supported by hf dataset.")
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
raise ValueError(
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:
@ -523,13 +505,6 @@ def validate_args(args):
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
# --data-parallel is not supported currently.
# https://github.com/vllm-project/vllm/issues/16222
if args.data_parallel_size > 1:
raise ValueError(
"Data parallel is not supported in offline benchmark, \
please use benchmark serving instead")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
@ -601,30 +576,18 @@ if __name__ == "__main__":
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser.add_argument(
"--prefix-len",
type=int,
default=None,
help=f"Number of prefix tokens to be used in RandomDataset "
"and SonnetDataset. For RandomDataset, the total input "
"length is the sum of prefix-len (default: "
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
"sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]. For SonnetDataset, "
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
"controls how much of the input is fixed lines versus "
"random lines, but the total input length remains approximately "
"input_len tokens.")
parser.add_argument("--prefix-len",
type=int,
default=None,
help="Number of prefix tokens per request."
"This is for the RandomDataset and SonnetDataset")
# random dataset
parser.add_argument(
"--random-range-ratio",
type=float,
default=None,
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
"for sampling input/output length, "
"used only for RandomDataset. Must be in the range [0, 1) to "
"define a symmetric sampling range "
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
help="Range of sampled ratio of input/output length, "
"used only for RandomDataSet.",
)
# hf dataset

View File

@ -1,236 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION)
try:
import bitblas
if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
raise ImportError("bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
except ImportError as e:
bitblas_import_exception = e
raise ValueError("Trying to use the bitblas backend, but could not import"
f"with the following error: {bitblas_import_exception}. "
"Please install bitblas through the following command: "
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
) from bitblas_import_exception
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target.")
# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.",
)
parser.add_argument("--group_size",
type=int,
default=None,
help="Group size for grouped quantization.")
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"],
help="Data type of activation A.",
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16",
"float32",
"float64",
"int32",
"int8",
"int4",
"int2",
"int1",
"nf4",
"fp4_e2m1",
],
help="Data type of weight W.",
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"],
help="Data type for accumulation.",
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"],
help="Data type for output.",
)
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"],
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument("--with_bias",
action="store_true",
help="Include bias in the benchmark.")
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization.",
)
parser.add_argument("--with_zeros",
action="store_true",
help="Include zeros in the quantization.")
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"],
help="Specify the mode for calculating zeros.",
)
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
# Define a list of shared arguments that repeat in every config
shared_args = [
A_dtype,
W_dtype,
out_dtype,
accum_dtype,
layout,
with_bias,
group_size,
with_scaling,
with_zeros,
zeros_mode,
]
# Define just the (M, K, N) shapes in a more compact list
shapes = [
# square test
(1, 16384, 16384),
# BLOOM-176B
(1, 43008, 14336),
(1, 14336, 14336),
(1, 57344, 14336),
(1, 14336, 57344),
# OPT-65B
(1, 9216, 9216),
(1, 36864, 9216),
(1, 9216, 36864),
(1, 22016, 8192),
# LLAMA-70B/65B
(1, 8192, 22016),
(1, 8192, 8192),
(1, 28672, 8192),
(1, 8192, 28672),
# square test
(16384, 16384, 16384),
# BLOOM-176B
(8192, 43008, 14336),
(8192, 14336, 14336),
(8192, 57344, 14336),
(8192, 14336, 57344),
# OPT-65B
(8192, 9216, 9216),
(8192, 36864, 9216),
(8192, 9216, 36864),
(8192, 22016, 8192),
# LLAMA-70B/65B
(8192, 8192, 22016),
(8192, 8192, 8192),
(8192, 28672, 8192),
(8192, 8192, 28672),
]
# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args))
for shape in shapes]
benchmark_sets = []
benchmark_sets.extend(test_shapes)
benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
config = config_class(*input_args)
matmul = operator(config, target=target, enable_tuning=True)
kernel_latency = matmul.profile_latency()
print("Time cost is: {:.3f} ms".format(kernel_latency))
profile_config = {
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
"BitBLAS_top20_latency": kernel_latency,
}
}
benchmark_results.update(profile_config)
# Define headers for the table
headers = [
"PrimFunc",
"Input Arguments",
"BitBLAS Top20 Latency",
]
# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
col_widths[1] = max(col_widths[1],
len(input_args_str) + 2,
len(headers[1]) + 2)
col_widths[2] = max(col_widths[2],
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
len(headers[2]) + 2)
# break only if you want to measure widths from a single example;
# otherwise, let it loop over all items.
# Print header
for i, header in enumerate(headers):
headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))
# Print rows
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
row = [
func_name,
input_args_str,
f"{values['BitBLAS_top20_latency']:.3f} ms",
]
row_str = "".join(
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)])
print(row_str)

View File

@ -1,340 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
fused_experts,
fused_topk)
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = [
"nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite",
"ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m"
]
DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False]
PER_OUT_CH_OPTS = [False]
def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def bench_run(results: list[benchmark.Measurement], model: str,
num_experts: int, topk: int, per_act_token: bool,
per_out_ch: bool, mkn: tuple[int, int, int]):
label = "Quant Matmul"
sub_label = (
"{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, "
"MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch,
mkn))
print(f"Testing: {sub_label}")
(m, k, n) = mkn
dtype = torch.half
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10
_, a_scale = ops.scaled_fp8_quant(a)
w1_q = torch.empty((num_experts, 2 * n, k),
device="cuda",
dtype=torch.float8_e4m3fn)
w2_q = torch.empty((num_experts, k, n),
device="cuda",
dtype=torch.float8_e4m3fn)
w1_scale = torch.empty((num_experts, 1, 1),
device="cuda",
dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1),
device="cuda",
dtype=torch.float32)
ab_strides1 = torch.full((num_experts, ),
k,
device="cuda",
dtype=torch.int64)
c_strides1 = torch.full((num_experts, ),
2 * n,
device="cuda",
dtype=torch.int64)
ab_strides2 = torch.full((num_experts, ),
n,
device="cuda",
dtype=torch.int64)
c_strides2 = torch.full((num_experts, ),
k,
device="cuda",
dtype=torch.int64)
for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
w1_scale: torch.Tensor, w2_scale: torch.Tensor,
a_scale: torch.Tensor, num_repeats: int):
for _ in range(num_repeats):
fused_experts(a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale)
def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor,
w1: torch.Tensor, w2: torch.Tensor,
w1_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
ab_strides2: torch.Tensor, c_strides2: torch.Tensor,
num_repeats: int):
for _ in range(num_repeats):
cutlass_moe_fp8(a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale)
def run_cutlass_from_graph(
a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(
pipeline_parallel_size=1))):
return cutlass_moe_fp8(a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale)
def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor,
w2: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, w1_scale: torch.Tensor,
w2_scale: torch.Tensor, a_scale: torch.Tensor):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(
pipeline_parallel_size=1))):
return fused_experts(a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale)
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale,
topk_weights, topk_ids, ab_strides1, c_strides1,
ab_strides2, c_strides2)
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights,
topk_ids, w1_scale, w2_scale, a_scale)
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5
num_runs = 25
globals = {
# Baseline params
"w1": w1,
"w2": w2,
"score": score,
"topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params
"a_scale": a_scale,
"w1_q": w1_q,
"w2_q": w2_q,
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
# Gen params
"a": a,
"topk_weights": topk_weights,
"topk_ids": topk_ids,
"num_runs": num_runs,
# Kernels
"run_triton_moe": run_triton_moe,
"run_cutlass_moe": run_cutlass_moe,
"replay_graph": replay_graph,
}
# Warmup
run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids,
w1_scale, w2_scale, a_scale, num_warmup)
results.append(
benchmark.Timer(
stmt=
"run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="triton_moe",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
replay_graph(triton_graph, num_warmup)
results.append(
benchmark.Timer(
stmt="replay_graph(triton_graph, num_runs)",
globals=globals,
label=label,
sub_label=sub_label,
description="triton_moe_cuda_graphs",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights,
topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2,
num_warmup)
results.append(
benchmark.Timer(
stmt=
"run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="grouped_gemm_moe",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
replay_graph(cutlass_graph, num_warmup)
results.append(
benchmark.Timer(
stmt="replay_graph(cutlass_graph, num_runs)",
globals=globals,
label=label,
sub_label=sub_label,
description="grouped_gemm_moe_cuda_graphs",
).blocked_autorange(min_run_time=min_run_time))
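The graph-capture calls above follow the standard PyTorch CUDA Graphs pattern; a minimal, self-contained sketch of that pattern (independent of the MoE kernels, and assuming a CUDA device is available):

import torch

assert torch.cuda.is_available()
static_x = torch.randn(64, 64, device="cuda")
static_w = torch.randn(64, 64, device="cuda")

# Warm up on a side stream before capture, as recommended for CUDA graphs.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        static_y = static_x @ static_w
torch.cuda.current_stream().wait_stream(s)

# Capture the kernel sequence once...
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_y = static_x @ static_w

# ...then write new data into the captured input buffers and replay,
# mirroring replay_graph() above.
static_x.copy_(torch.randn(64, 64, device="cuda"))
g.replay()
torch.cuda.synchronize()
print(static_y.sum().item())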
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: list[benchmark.Measurement] = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in PER_ACT_TOKEN_OPTS:
for per_out_ch in PER_OUT_CH_OPTS:
for size_m in DEFAULT_BATCH_SIZES:
mkn = (size_m, size_k, size_n)
bench_run(results, model, num_experts, topk,
per_act_token, per_out_ch, mkn)
compare = benchmark.Compare(results)
compare.print()
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark Marlin across specified models/shapes/batches")
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
parser.add_argument("--limit-per-act-token",
nargs="+",
type=int,
default=[])
parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])
args = parser.parse_args()
main(args)

View File

@ -17,14 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES
from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
lora_shrink)
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
_LORA_B_PTR_DICT)
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())

View File

@ -30,18 +30,19 @@ class BenchmarkConfig(TypedDict):
num_stages: int
def benchmark_config(config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
use_deep_gemm: bool = False) -> float:
def benchmark_config(
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16:
@ -114,41 +115,22 @@ def benchmark_config(config: BenchmarkConfig,
def run():
from vllm.model_executor.layers.fused_moe import override_config
with override_config(config):
if use_deep_gemm:
topk_weights, topk_ids = fused_topk(x, input_gating, topk,
False)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
allow_deep_gemm=True,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
# JIT compilation & warmup
run()
@ -384,7 +366,6 @@ class BenchmarkWorker:
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
@ -415,8 +396,7 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm)
block_quant_shape=block_quant_shape)
return config, kernel_time
def tune(
@ -431,7 +411,6 @@ class BenchmarkWorker:
use_int8_w8a16: bool,
search_space: list[dict[str, int]],
block_quant_shape: list[int],
use_deep_gemm: bool,
) -> dict[str, int]:
best_config = None
best_time = float("inf")
@ -457,8 +436,7 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm)
block_quant_shape=block_quant_shape)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
@ -553,8 +531,6 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Support for llama4
config = config.get_text_config()
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
@ -574,8 +550,6 @@ def main(args: argparse.Namespace):
else:
batch_sizes = [args.batch_size]
use_deep_gemm = bool(args.use_deep_gemm)
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -598,10 +572,10 @@ def main(args: argparse.Namespace):
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
block_quant_shape, use_deep_gemm)
for batch_size in batch_sizes])
"tune",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
@ -615,7 +589,7 @@ def main(args: argparse.Namespace):
outputs = _distribute(
"benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -637,7 +611,6 @@ if __name__ == "__main__":
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"],
default="auto")
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")

View File

@ -7,13 +7,10 @@ from typing import Optional
import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random)
logger = init_logger(__name__)
NUM_BLOCKS = 128 * 1024
PARTITION_SIZE = 512
PARTITION_SIZE_ROCM = 256
@ -196,9 +193,6 @@ def main(
if __name__ == '__main__':
logger.warning("This script benchmarks the paged attention kernel. "
"By default this is no longer used in vLLM inference.")
parser = FlexibleArgumentParser(
description="Benchmark the paged attention kernel.")
parser.add_argument("--version",

View File

@ -75,19 +75,3 @@ WEIGHT_SHAPES = {
[7168, 8192],
],
}
WEIGHT_SHAPES_MOE = {
"nm-testing/Mixtral-8x7B-Instruct-v0.1": [
[8, 2, 4096, 28672],
[8, 2, 14336, 4096],
],
"nm-testing/deepseekv2-lite": [
[64, 6, 2048, 1408],
],
"ibm-granite/granite-3.0-1b-a400m": [
[32, 8, 1024, 1024],
],
"ibm-granite/granite-3.0-3b-a800m": [
[40, 8, 1024, 1536],
],
}

View File

@ -1,420 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse
import json
import multiprocessing as mp
import os
import time
from datetime import datetime
from typing import Any
import torch
import tqdm
import triton
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
assert current_platform.is_cuda(
), "Only support tune w8a8 block fp8 kernel on CUDA device."
DTYPE_MAP = {
"float32": torch.float32,
"float16": torch.float16,
"half": torch.half,
"bfloat16": torch.bfloat16,
}
def w8a8_block_matmul(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
block_size: list[int],
config: dict[str, Any],
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
"""This function performs matrix multiplication with
block-wise quantization.
It takes two input tensors `A` and `B` with scales `As` and `Bs`.
The output is returned in the specified `output_dtype`.
Args:
A: The input tensor, e.g., activation.
B: The input tensor, e.g., weight.
As: The per-token-group quantization scale for `A`.
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
"""
assert len(block_size) == 2
block_n, block_k = block_size[0], block_size[1]
assert A.shape[-1] == B.shape[-1]
assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
M = A.numel() // A.shape[-1]
assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
N, K = B.shape
assert triton.cdiv(N, block_n) == Bs.shape[0]
assert triton.cdiv(K, block_k) == Bs.shape[1]
C_shape = A.shape[:-1] + (N, )
C = A.new_empty(C_shape, dtype=output_dtype)
def grid(META):
return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
triton.cdiv(N, META["BLOCK_SIZE_N"]), )
if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_block_fp8_matmul
else:
raise RuntimeError(
"Currently, only support tune w8a8 block fp8 kernel.")
kernel[grid](
A,
B,
C,
As,
Bs,
M,
N,
K,
block_n,
block_k,
A.stride(-2),
A.stride(-1),
B.stride(1),
B.stride(0),
C.stride(-2),
C.stride(-1),
As.stride(-2),
As.stride(-1),
Bs.stride(1),
Bs.stride(0),
**config,
)
return C
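A rough usage sketch for the wrapper above, mirroring the shapes and scales constructed in tune() further down; it assumes a CUDA device with float8_e4m3fn support and uses a config of the same form as the search space below:

import torch
import triton

M, N, K = 64, 512, 7168
block_size = [128, 128]                    # [block_n, block_k]
block_n, block_k = block_size

fp8 = torch.float8_e4m3fn
A = torch.randn(M, K, device="cuda").to(fp8)   # activation
B = torch.randn(N, K, device="cuda").to(fp8)   # weight
# Per-token-group scales for A, per-block scales for B.
As = torch.rand(M, triton.cdiv(K, block_k), dtype=torch.float32, device="cuda")
Bs = torch.rand(triton.cdiv(N, block_n), triton.cdiv(K, block_k),
                dtype=torch.float32, device="cuda")

config = {
    "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128,
    "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3,
}
C = w8a8_block_matmul(A, B, As, Bs, block_size, config, torch.float16)
print(C.shape)  # torch.Size([64, 512])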
def get_configs_compute_bound():
configs = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128]:
for block_n in [32, 64, 128, 256]:
for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]:
configs.append({
"BLOCK_SIZE_M": block_m,
"BLOCK_SIZE_N": block_n,
"BLOCK_SIZE_K": block_k,
"GROUP_SIZE_M": group_size,
"num_warps": num_warps,
"num_stages": num_stages,
})
return configs
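The nested loops above enumerate 4 x 5 x 2 x 4 x 2 x 4 = 1,280 candidate configs; tune_on_gpu() further below then drops any config whose BLOCK_SIZE_K does not evenly divide the tuned block_k before the search starts.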
def get_weight_shapes(tp_size):
# NOTE(HandH1998): The weight shapes only work for DeepSeek-V3.
# Modify them, if you tune for another different model.
# cannot TP
total = [
(512 + 64, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
(7168, 18432),
]
# N can TP
n_tp = [
(18432 * 2, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(24576, 1536),
(12288, 7168),
(4096, 7168),
]
# K can TP
k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
weight_shapes = []
for t in total:
weight_shapes.append(t)
for n_t in n_tp:
new_t = (n_t[0] // tp_size, n_t[1])
weight_shapes.append(new_t)
for k_t in k_tp:
new_t = (k_t[0], k_t[1] // tp_size)
weight_shapes.append(new_t)
return weight_shapes
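For example, with tp_size=8 an N-shardable shape such as (24576, 1536) is benchmarked as (3072, 1536) per rank and a K-shardable shape such as (7168, 16384) becomes (7168, 2048), while the shapes in the "total" list are kept unsharded on every rank.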
def benchmark_config(A,
B,
As,
Bs,
block_size,
config,
out_dtype=torch.float16,
num_iters=10):
def run():
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
torch.cuda.synchronize()
# JIT compilation & warmup
for _ in range(5):
run()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
start_event.record()
run()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000 # us
return avg
def tune(M, N, K, block_size, out_dtype, search_space, input_type):
factor_for_scale = 1e-2
if input_type == "fp8":
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
A_fp32 = (
(torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
fp8_max)
A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp32 = (
(torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
fp8_max)
B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
else:
raise RuntimeError(
"Currently, only support tune w8a8 block fp8 kernel.")
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
As = torch.rand(M, k_tiles, dtype=torch.float32,
device="cuda") * factor_for_scale
Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
factor_for_scale)
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(
A,
B,
As,
Bs,
block_size,
config,
out_dtype,
num_iters=10,
)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if kernel_time < best_time:
best_time = kernel_time
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={M}")
assert best_config is not None
return best_config
def save_configs(
N,
K,
block_n,
block_k,
configs,
save_path,
input_type="fp8",
) -> None:
os.makedirs(save_path, exist_ok=True)
device_name = current_platform.get_device_name().replace(" ", "_")
json_file_name = (
f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
f"block_shape=[{block_n},{block_k}].json")
config_file_path = os.path.join(save_path, json_file_name)
print(f"Writing best config to {config_file_path}...")
with open(config_file_path, "w") as f:
json.dump(configs, f, indent=4)
f.write("\n")
def tune_on_gpu(args_dict):
"""Run tuning on a specific GPU."""
gpu_id = args_dict["gpu_id"]
batch_sizes = args_dict["batch_sizes"]
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.cuda.set_device(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
block_k = args.block_k
out_dtype = DTYPE_MAP[args.out_dtype]
save_path = args.save_path
input_type = args.input_type
search_space = get_configs_compute_bound()
search_space = [
config for config in search_space
if block_k % config["BLOCK_SIZE_K"] == 0
]
start = time.time()
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
N, K = shape[0], shape[1]
print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
benchmark_results = [
tune(
batch_size,
N,
K,
[block_n, block_k],
out_dtype,
search_space,
input_type,
) for batch_size in tqdm(batch_sizes,
desc=f"GPU {gpu_id} - Batch sizes")
]
best_configs = {
M: config
for M, config in zip(batch_sizes, benchmark_results)
}
save_configs(N, K, block_n, block_k, best_configs, save_path,
input_type)
end = time.time()
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
def distribute_batch_sizes(batch_sizes, num_gpus):
"""Distribute batch sizes across available GPUs."""
batches_per_gpu = []
for i in range(num_gpus):
start_idx = i * len(batch_sizes) // num_gpus
end_idx = (i + 1) * len(batch_sizes) // num_gpus
batches_per_gpu.append(batch_sizes[start_idx:end_idx])
return batches_per_gpu
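For instance, the 18 default batch sizes spread over 4 GPUs come out as chunks of 4, 5, 4 and 5 entries, since GPU i takes the slice [i * 18 // 4, (i + 1) * 18 // 4).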
def main(args):
print(args)
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")
torch.cuda.init()
if args.batch_size is None:
batch_sizes = [
1,
2,
4,
8,
16,
24,
32,
48,
64,
96,
128,
256,
512,
1024,
1536,
2048,
3072,
4096,
]
else:
batch_sizes = [args.batch_size]
num_gpus = 1 # If only one batch size, use only one GPU
weight_shapes = get_weight_shapes(args.tp_size)
batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
process_args = []
for gpu_id in range(num_gpus):
process_args.append({
"gpu_id": gpu_id,
"batch_sizes": batches_per_gpu[gpu_id],
"weight_shapes":
weight_shapes, # Each GPU processes all weight shapes
"args": args,
})
ctx = mp.get_context("spawn")
with ctx.Pool(num_gpus) as pool:
pool.map(tune_on_gpu, process_args)
print("Multi-GPU tuning completed")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""
Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
Then copy to model_executor/layers/quantization/utils/configs
""",
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--tp-size", "-tp", type=int, default=8)
parser.add_argument("--input-type",
type=str,
choices=["fp8"],
default="fp8")
parser.add_argument(
"--out-dtype",
type=str,
choices=["float32", "float16", "bfloat16", "half"],
default="float16",
)
parser.add_argument("--block-n", type=int, default=128)
parser.add_argument("--block-k", type=int, default=128)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--save-path", type=str, default="./")
args = parser.parse_args()
main(args)

benchmarks/launch_tgi_server.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
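# Launches a text-generation-inference container for benchmarking.
# Usage (illustrative): HF_TOKEN=<token> ./benchmarks/launch_tgi_server.sh <model-id> <max-batch-total-tokens>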
PORT=8000
MODEL=$1
TOKENS=$2
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
-v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
--model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
--max-batch-total-tokens "$TOKENS"

View File

@ -33,6 +33,8 @@ endif()
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-Xpreprocessor"
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
@ -188,14 +190,12 @@ set(VLLM_EXT_SRC
"csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
"csrc/cpu/quant.cpp"
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -105,14 +105,8 @@ def run(command):
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
if command == 'nvidia-smi topo -m':
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc)
return rc, output, err.strip()
return rc, output.strip(), err.strip()
def run_and_read_all(run_lambda, command):
@ -282,20 +276,12 @@ def get_vllm_version():
if __version__ == "dev":
return "N/A (dev)"
version_str = __version_tuple__[-1]
if isinstance(version_str, str) and version_str.startswith('g'):
# it's a dev build
if '.' in version_str:
# it's a dev build containing local changes
git_sha = version_str.split('.')[0][1:]
date = version_str.split('.')[-1][1:]
return f"{__version__} (git sha: {git_sha}, date: {date})"
else:
# it's a dev build without local changes
git_sha = version_str[1:] # type: ignore
return f"{__version__} (git sha: {git_sha})"
return __version__
if len(__version_tuple__) == 4: # dev build
git_sha = __version_tuple__[-1][1:] # type: ignore
return f"{__version__} (git sha: {git_sha}"
return __version__
def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
@ -496,30 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_PIP_PATTERNS
def run_with_pip():
try:
import importlib.util
pip_spec = importlib.util.find_spec('pip')
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
elif os.environ.get("UV") is not None:
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError(
"Could not collect pip list output (pip or uv module not available)"
)
out = run_and_read_all(run_lambda, cmd)
# People generally have `pip` as `pip` or `pip3`
# But here it is invoked as `python -mpip`
def run_with_pip(pip):
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
return "\n".join(line for line in out.splitlines()
if any(name in line for name in patterns))
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
out = run_with_pip()
out = run_with_pip([sys.executable, '-mpip'])
return pip_version, out
@ -545,12 +517,13 @@ def is_xnnpack_available():
else:
return "N/A"
def get_env_vars():
env_vars = ''
secret_terms = ('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
"OMP_", "MKL_", "NVIDIA")
secret_terms=('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "NCCL", "PYTORCH",
"CUDA", "CUBLAS", "CUDNN",
"OMP_", "MKL_",
"NVIDIA")
for k, v in os.environ.items():
if any(term in k.lower() for term in secret_terms):
continue
@ -561,7 +534,6 @@ def get_env_vars():
return env_vars
def get_env_info():
run_lambda = run
pip_version, pip_list_output = get_pip_packages(run_lambda)

View File

@ -1,178 +0,0 @@
#include <optional>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
namespace vllm {
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
template <typename scalar_t, const uint NUM_THREADS>
__global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
const uint token_head_threads = num_tokens * num_heads * threads_per_head;
if (global_idx >= token_head_threads) return;
// global_idx -> token_idx + head_idx + pack_idx
const uint token_head_idx = global_idx / threads_per_head;
const uint pack_idx = global_idx % threads_per_head;
const uint token_idx = token_head_idx / num_heads;
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset =
token_idx * num_heads * head_size + head_idx * head_size;
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
scalar_t* output_head_ptr = output + head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
const float max_lse = fmaxf(p_lse, s_lse);
p_lse = p_lse - max_lse;
s_lse = s_lse - max_lse;
const float p_se = expf(p_lse);
const float s_se = expf(s_lse);
const float out_se = p_se + s_se;
const float p_scale = p_se / out_se;
const float s_scale = s_se / out_se;
if (pack_offset < head_size) {
// Pack 128b load
pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
prefix_head_ptr)[pack_offset / pack_size];
pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
suffix_head_ptr)[pack_offset / pack_size];
pack_128b_t o_out_pack;
#pragma unroll
for (uint i = 0; i < pack_size; ++i) {
// Always use float for FMA to keep high precision.
// half(uint16_t), bfloat16, float -> float.
const float p_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
const float s_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
// fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
// float -> half(uint16_t), bfloat16, float.
vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
}
// Pack 128b storage
reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
o_out_pack;
}
// We only need to write to output_lse once per head.
if (output_lse != nullptr && pack_idx == 0) {
float out_lse = logf(out_se) + max_lse;
output_lse[head_idx * num_tokens + token_idx] = out_lse;
}
}
} // namespace vllm
// The following macro is used to dispatch the conversion function based on
// the output data type. The FN is a macro that calls a function with
// template<typename scalar_t>.
#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \
{ \
if (scalar_dtype == at::ScalarType::Float) { \
fn(float); \
} else if (scalar_dtype == at::ScalarType::Half) { \
fn(uint16_t); \
} else if (scalar_dtype == at::ScalarType::BFloat16) { \
fn(__nv_bfloat16); \
} else { \
TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
} \
}
#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \
{ \
vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
reinterpret_cast<scalar_t*>(prefix_output.data_ptr()), \
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \
}
/* @brief Merges the attention states from prefix and suffix
* into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
*
* @param output [n,h,d] The output tensor to store the merged attention states.
* @param output_lse [h,n] Optional tensor to store the log-sum-exp values.
* @param prefix_output [n,h,d] The prefix attention states.
* @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
* states.
* @param suffix_output [n,h,d] The suffix attention states.
* @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
* states.
*/
template <typename scalar_t>
void merge_attn_states_launcher(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
constexpr uint NUM_THREADS = 128;
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();
}
// Process one pack of elements per thread. For float, the
// pack_size is 4; for half/bf16, the pack_size is 8.
const uint threads_per_head = head_size / pack_size;
const uint total_threads = num_tokens * num_heads * threads_per_head;
dim3 block(NUM_THREADS);
dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
auto stream = at::cuda::getCurrentCUDAStream();
LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
}
#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
{ \
merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
prefix_lse, suffix_output, \
suffix_lse); \
}
void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
}
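For reference, the merge performed by the kernel above is the standard log-sum-exp combination from section 2.2 of the cited paper. With prefix/suffix partial outputs O_p, O_s, their log-sum-exps l_p, l_s, and m = max(l_p, l_s), a sketch of the algebra is:

O = \frac{e^{l_p - m}}{e^{l_p - m} + e^{l_s - m}} O_p + \frac{e^{l_s - m}}{e^{l_p - m} + e^{l_s - m}} O_s,
\qquad
l_\text{out} = m + \log\left(e^{l_p - m} + e^{l_s - m}\right)

which corresponds to p_scale, s_scale, and out_lse computed per (token, head) in merge_attn_states_kernel.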

View File

@ -1,38 +0,0 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale);
#endif
void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale);
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
}

View File

@ -1,225 +0,0 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/kernel_hardware_info.h"
#include "cutlass_extensions/common.hpp"
#include "device/sm100_mla.hpp"
#include "kernel/sm100_mla_tile_scheduler.hpp"
using namespace cute;
using namespace cutlass::fmha::kernel;
template <typename T, bool PersistenceOption = true>
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
using ElementOut = T;
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
using TileShapeD = cute::tuple_element_t<2, TileShape>;
// H K (D_latent D_rope) B
using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B
using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B
using StrideO = StrideK; // H D B
using StrideLSE = cute::tuple<_1, int>; // H B
using TileScheduler =
std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
Sm100MlaIndividualTileScheduler>;
using FmhaKernel =
cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
/*kIsCpAsync=*/true>;
using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};
template <typename T>
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
at::Tensor const& page_table, double scale) {
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = q_nope.device().index();
hw_info.sm_count =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
int batches = q_nope.sizes()[0];
int page_count_per_seq = page_table.sizes()[1];
int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
int page_size = kv_c_and_k_pe_cache.sizes()[1];
int max_seq_len = page_size * page_count_per_seq;
using TileShapeH = typename T::TileShapeH;
using TileShapeD = typename T::TileShapeD;
auto problem_shape =
cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
auto [H, K, D, B] = problem_shape;
auto [D_latent, D_rope] = D;
using StrideQ = typename T::StrideQ;
using StrideK = typename T::StrideK;
using StrideO = typename T::StrideO;
using StrideLSE = typename T::StrideLSE;
StrideQ stride_Q_latent = cute::make_tuple(
static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
static_cast<int64_t>(H * D_rope));
StrideK stride_C =
cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
static_cast<int64_t>(page_size * (D_latent + D_rope)));
StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
static_cast<int64_t>(H * D_latent));
using Element = typename T::Element;
using ElementOut = typename T::ElementOut;
using ElementAcc = typename T::ElementAcc;
auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
auto scale_f = static_cast<float>(scale);
typename T::Fmha::Arguments arguments{
problem_shape,
{scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
stride_C, C_ptr + D_latent, stride_C,
static_cast<int*>(seq_lens.data_ptr()),
static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
page_size},
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
-1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
// split_kv automatically based on batch size and sequence length to balance
// workload across available SMs. Consider using var_split_kv for manual
// control if needed.
T::Fmha::set_split_kv(arguments);
return arguments;
}
template <typename Element>
void runMla(at::Tensor const& out, at::Tensor const& q_nope,
at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
at::Tensor const& seq_lens, at::Tensor const& page_table,
float scale, cudaStream_t stream) {
using MlaSm100Type = MlaSm100<Element>;
typename MlaSm100Type::Fmha fmha;
auto arguments = args_from_options<MlaSm100Type>(
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
auto workspace = torch::empty(workspace_size, workspace_options);
CUTLASS_CHECK(fmha.can_implement(arguments));
CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
}
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
"kv_c_and_k_pe_cache must be a 3D tensor");
TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
auto B_q_nope = q_nope.size(0);
auto H_q_nope = q_nope.size(1);
auto D_q_nope = q_nope.size(2);
auto B_q_pe = q_pe.size(0);
auto H_q_pe = q_pe.size(1);
auto D_q_pe = q_pe.size(2);
auto B_pt = page_table.size(0);
auto PAGE_NUM = page_table.size(1);
auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
auto D_ckv = kv_c_and_k_pe_cache.size(2);
auto B_o = out.size(0);
auto H_o = out.size(1);
auto D_o = out.size(2);
TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
"H_q_nope, H_q_pe, and H_o must be equal to 128");
TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
"PAGE_SIZE must be a power of 2");
TORCH_CHECK(
B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
"Batch dims must be same for page_table, q_nope and q_pe, and out");
TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
"PAGE_NUM must be divisible by 128 / PAGE_SIZE");
TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
q_nope.dtype() == at::ScalarType::BFloat16 ||
q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
"q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
q_nope.dtype() == q_pe.dtype(),
"kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
"seq_lens must be a 32-bit integer tensor");
TORCH_CHECK(page_table.dtype() == torch::kInt32,
"page_table must be a 32-bit integer tensor");
auto in_dtype = q_nope.dtype();
at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(q_nope.get_device());
if (in_dtype == at::ScalarType::Half) {
runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
page_table, scale, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
}
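For reference, a minimal sketch of tensors that satisfy the shape/dtype checks in the removed cutlass_mla_decode_sm100a above; the batch size, pages per sequence, and sequence length are illustrative assumptions, and the actual kernel call is omitted because the Python binding is not shown in this diff:

import torch

B, H, D_latent, D_rope = 4, 128, 512, 64   # H=128 and the head dims are fixed by the TORCH_CHECKs
page_size, pages_per_seq = 128, 8          # page_size must be a power of 2

q_nope = torch.zeros(B, H, D_latent, dtype=torch.bfloat16, device="cuda")
q_pe = torch.zeros(B, H, D_rope, dtype=torch.bfloat16, device="cuda")
kv_c_and_k_pe_cache = torch.zeros(B * pages_per_seq, page_size, D_latent + D_rope,
                                  dtype=torch.bfloat16, device="cuda")
seq_lens = torch.full((B,), 1000, dtype=torch.int32, device="cuda")  # int32 per the checks
page_table = torch.zeros(B, pages_per_seq, dtype=torch.int32, device="cuda")
out = torch.empty(B, H, D_latent, dtype=torch.bfloat16, device="cuda")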

View File

@ -270,10 +270,9 @@ __global__ void reshape_and_cache_flash_kernel(
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
// head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride,
const int64_t value_stride, const int num_heads, const int head_size,
const int block_size, const float* k_scale, const float* v_scale) {
const int block_stride, const int key_stride, const int value_stride,
const int num_heads, const int head_size, const int block_size,
const float* k_scale, const float* v_scale) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
@ -289,8 +288,8 @@ __global__ void reshape_and_cache_flash_kernel(
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_key_value_idx = block_idx * block_stride +
block_offset * page_stride +
head_idx * head_stride + head_offset;
block_offset * num_heads * head_size +
head_idx * head_size + head_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
@ -397,16 +396,16 @@ void reshape_and_cache(
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
head_stride, key_stride, value_stride, num_heads, head_size, \
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride, \
value_stride, num_heads, head_size, block_size, \
reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()));
void reshape_and_cache_flash(
@ -433,11 +432,9 @@ void reshape_and_cache_flash(
int head_size = key.size(2);
int block_size = key_cache.size(1);
int64_t key_stride = key.stride(0);
int64_t value_stride = value.stride(0);
int64_t block_stride = key_cache.stride(0);
int64_t page_stride = key_cache.stride(1);
int64_t head_stride = key_cache.stride(2);
int key_stride = key.stride(0);
int value_stride = value.stride(0);
int block_stride = key_cache.stride(0);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
dim3 grid(num_tokens);

View File

@ -88,48 +88,6 @@ void reshape_and_cache_cpu_impl(
}
}; // namespace
template <typename scalar_t>
void concat_and_cache_mla_cpu_impl(
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int num_tokens, //
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size //
) {
#pragma omp parallel for
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
continue;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src,
scalar_t* __restrict__ dst, int src_stride, int dst_stride,
int size, int offset) {
for (int i = 0; i < size; i++) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
dst[dst_idx] = src[src_idx];
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
}
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
@ -176,38 +134,6 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
});
}
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
TORCH_CHECK(kv_cache_dtype != "fp8");
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
VLLM_DISPATCH_FLOATING_TYPES(
kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
concat_and_cache_mla_cpu_impl<scalar_t>(
kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
kv_lora_rank, pe_dim, block_size);
CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
});
}
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping) {
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")

View File

@ -78,14 +78,9 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
__m256i reg;
// normal load
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit FP16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -115,14 +110,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
__m256i reg;
// normal load
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -140,8 +130,6 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
__m512i reg;
explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
explicit BF16Vec32(__m512i data) : reg(data) {}
@ -323,13 +311,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
explicit FP32Vec16(__m512 data) : reg(data) {}
explicit FP32Vec16(const FP32Vec4& data)
@ -562,33 +545,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
_mm_mask_storeu_epi8(ptr, mask, reg);
}
};
struct INT8Vec64 : public Vec<INT8Vec64> {
constexpr static int VEC_ELEM_NUM = 64;
union AliasReg {
__m512i reg;
int8_t values[VEC_ELEM_NUM];
};
__m512i reg;
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
void save(int8_t* ptr, const int elem_num) const {
constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
__mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif
template <typename T>
@ -699,22 +655,6 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
#ifdef __AVX512F__
inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
_mm512_stream_si512((__m512i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
_mm512_stream_ps((float*)ptr, vec.reg);
}
#endif
inline void mem_barrier() { _mm_mfence(); }
}; // namespace vec_op
#endif

View File

@ -1,393 +0,0 @@
#include "cpu_types.hpp"
#include <float.h>
namespace {
template <typename scalar_t>
struct KernelVecType {
using qk_load_vec_type = void;
using qk_vec_type = void;
using v_load_vec_type = void;
};
template <>
struct KernelVecType<float> {
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::Half> {
#if defined(__powerpc64__) || defined(__s390x__)
// Power and s390x architecture-specific vector types
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
#else
// Fallback for other architectures, including x86
using qk_load_vec_type = vec_op::FP16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP16Vec16;
#endif
};
#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec32;
using qk_vec_type = vec_op::BF16Vec32;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// pass
#else
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
typename qk_vec_type>
void mla_decode_block_head(
const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim]
const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim]
const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim]
float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim]
float* __restrict__ acc_lse, // [HEAD_UNROLL]
const float scale, const int num_tokens) {
using f32_vec_type = vec_op::FP32Vec16;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros
float max_val[HEAD_UNROLL];
std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
// load to registers
qk_vec_type q_vec[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
q_vec[unroll] =
qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
}
}
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
logits[block_offset][unroll] = acc;
max_val[unroll] = std::max(max_val[unroll], acc);
}
}
float sum_exp[HEAD_UNROLL] = {};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float val =
std::exp(logits[block_offset][unroll] - max_val[unroll]);
logits[block_offset][unroll] = val;
sum_exp[unroll] += val;
}
}
f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
// load to registers
f32_vec_type scale_[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
scale_[unroll] =
f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
f32_vec_type v_vec(
v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
}
}
// merge attention state
// section 2.2 in https://arxiv.org/pdf/2501.01005
f32_vec_type prev_scale[HEAD_UNROLL];
f32_vec_type curr_scale[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float prev_lse = acc_lse[unroll];
const float curr_lse = std::log(sum_exp[unroll]) +
max_val[unroll]; // add back max_val to get true lse
// softmax trick
const float max_lse = std::max(prev_lse, curr_lse);
const float prev_sum_exp = std::exp(prev_lse - max_lse);
const float curr_sum_exp = std::exp(curr_lse - max_lse);
const float new_sum_exp = prev_sum_exp + curr_sum_exp;
acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
}
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
o_vec = o_vec * prev_scale[unroll] +
this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
}
}
q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
acc_out += V_HEAD_DIM * HEAD_UNROLL;
}
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
typename qk_vec_type>
void mla_decode_block(
const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [block_size, head_dim]
float* __restrict__ acc_out, // [num_heads, v_head_dim]
float* __restrict__ acc_lse, // [num_heads]
const int num_heads, const float scale, const int num_tokens) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
static_assert(
std::is_same<qk_vec_type,
typename KernelVecType<scalar_t>::qk_vec_type>::value);
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
using f32_vec_type = vec_op::FP32Vec16;
static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
const qk_vec_type* k_vecs;
const f32_vec_type* v_vecs_f32;
float* kv_cache_f32 = nullptr;
if constexpr (!std::is_same<scalar_t, float>::value) {
// convert KV cache block to FP32 to reuse it across query heads and
// attn @ V computation, since FP16/BF16->FP32 is expensive.
// TODO: move malloc outside of this fn to reuse across iterations.
const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
f32_vec_type kv_vec_f32(kv_load_vec);
kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
}
if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
// for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
// NOTE: in this case, we only need to convert the V section to FP32.
// But for simplicity, we will convert the whole KV block to FP32.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
} else {
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
}
// attn @ V always uses FP32 for V, since attn is FP32.
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
} else {
// KV cache is already FP32, so no conversion is needed.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
}
// compute 2 heads at the same time to improve ILP and
// take advantage of register cache for K and V.
constexpr int HEAD_UNROLL = 2;
for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
acc_out += HEAD_UNROLL * V_HEAD_DIM;
acc_lse += HEAD_UNROLL;
}
// take care of the remaining heads
for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_DIM / QK_NUM_ELEM;
acc_out += V_HEAD_DIM;
acc_lse += 1;
}
if (kv_cache_f32 != nullptr) {
std::free(kv_cache_f32);
}
}
} // namespace
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
void mla_decode_kvcache_cpu_impl(
scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size,
// head_dim]
const int num_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
const int kv_stride, const int num_seqs) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
// shared across threads
const int max_threads = omp_get_max_threads();
const int acc_out_nbytes =
max_threads * num_heads * V_HEAD_DIM * sizeof(float);
float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
std::vector<float> acc_lse(max_threads * num_heads);
// allocate memory to pre-convert query to FP32 later
float* q_f32;
constexpr bool PRE_CONVERT_QUERY =
!std::is_same<scalar_t, float>::value &&
std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
if constexpr (PRE_CONVERT_QUERY) {
const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
}
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
float* __restrict__ acc_out_thread =
acc_out + thread_id * num_heads * V_HEAD_DIM;
float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
// reset accumulator
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
const int seq_len = seq_lens[seq_idx];
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
const qk_vec_type* q_vecs;
if constexpr (PRE_CONVERT_QUERY) {
// pre-convert query to FP32 since FP16/BF16->FP32 is slow.
#pragma omp for
for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
qk_vec_type q_vec(q_load_vec);
q_vec.save(q_f32 + i);
}
q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
} else {
q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
}
#pragma omp for
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
const int physical_block_idx =
block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
const int num_tokens =
block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
acc_lse_thread, num_heads, scale, num_tokens);
}
// merge attention states across threads
// section 2.2 in https://arxiv.org/pdf/2501.01005
// each thread is responsible for 1 head
#pragma omp for
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
float* acc_lse_head = acc_lse.data() + head_idx;
float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
float max_val = -FLT_MAX;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
}
float sum_exp = 0.0f;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
acc_lse_head[thread_id_ * num_heads] = val;
sum_exp += val;
}
float inv_sum = 1.0f / sum_exp;
float out_head[V_HEAD_DIM] = {};
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
for (int i = 0; i < V_HEAD_DIM; ++i) {
out_head[i] +=
acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
}
}
for (int i = 0; i < V_HEAD_DIM; ++i) {
vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
head_idx * V_HEAD_DIM + i);
}
}
}
}
if (PRE_CONVERT_QUERY) {
std::free(q_f32);
}
std::free(acc_out);
}
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens) {
const int num_seqs = query.size(0);
const int num_heads = query.size(1);
const int head_dim = query.size(2);
const int block_size = kv_cache.size(1);
const int v_head_dim = out.size(2);
const int max_num_blocks_per_seq = block_tables.size(1);
const int o_stride = out.stride(0);
const int q_stride = query.stride(0);
const int kv_stride = kv_cache.stride(0);
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), num_heads, scale,
block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
else
TORCH_CHECK(false, "Unsupported block size: ", block_size);
CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
});
}

View File

@ -1,781 +0,0 @@
#include "cpu/cpu_types.hpp"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
namespace {
#define MAX_SHM_RANK_NUM 8
#define MAX_THREAD_NUM 12
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
#define MIN_THREAD_PROCESS_SIZE (8 * 1024)
#define MAX_P2P_SEND_TENSOR_NUM 8
template <typename scalar_t>
struct KernelVecType {
using scalar_vec_t = void;
};
template <>
struct KernelVecType<float> {
using scalar_vec_t = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using scalar_vec_t = vec_op::BF16Vec16;
};
template <>
struct KernelVecType<c10::Half> {
using scalar_vec_t = vec_op::FP16Vec16;
};
enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
struct ThreadSHMContext {
volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
int thread_id;
int thread_num;
int rank;
int group_size;
size_t _spinning_count;
int swizzled_ranks[MAX_SHM_RANK_NUM];
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
const int group_size, void* thread_shm_ptr)
: thread_id(thread_id),
thread_num(thread_num),
rank(rank),
group_size(group_size),
_spinning_count(0) {
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
shm_contexts[i] = nullptr;
thread_shm_ptrs[i] = nullptr;
swizzled_ranks[i] = (i + rank) % group_size;
thread_stats[i] = ThreadSHMStat::DONE;
}
set_context(rank, this, thread_shm_ptr);
}
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
TORCH_CHECK(ptr);
TORCH_CHECK(thread_shm_ptr);
TORCH_CHECK_EQ(ptr->thread_num, thread_num);
TORCH_CHECK_EQ(ptr->thread_id, thread_id);
shm_contexts[rank] = ptr;
thread_shm_ptrs[rank] = thread_shm_ptr;
}
template <typename T>
T* get_thread_shm_ptr(int rank) {
return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
}
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
void wait_for_all(ThreadSHMStat prev_stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
while (thread_stats[rank] == prev_stat) {
++_spinning_count;
_mm_pause();
}
}
vec_op::mem_barrier();
}
void wait_for_one(int rank, ThreadSHMStat prev_stat) {
while (thread_stats[rank] == prev_stat) {
++_spinning_count;
_mm_pause();
}
vec_op::mem_barrier();
}
void set_thread_stat(ThreadSHMStat stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
shm_contexts[rank]->thread_stats[this->rank] = stat;
}
}
void set_thread_stat(int target_rank, ThreadSHMStat stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
shm_contexts[rank]->thread_stats[target_rank] = stat;
}
}
// barrier for all ranks in the group, used for all2all ops
// DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
void barrier(ThreadSHMStat next_stat) {
if (next_stat == ThreadSHMStat::THREAD_READY) {
set_thread_stat(ThreadSHMStat::THREAD_READY);
wait_for_all(ThreadSHMStat::DONE);
} else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
wait_for_all(ThreadSHMStat::THREAD_READY);
} else if (next_stat == ThreadSHMStat::DONE) {
set_thread_stat(ThreadSHMStat::DONE);
wait_for_all(ThreadSHMStat::SHM_DATA_READY);
} else {
TORCH_CHECK(false, "Invalid next_stat to barrier.");
}
}
std::string to_string() const {
std::stringstream ss;
ss << "SHMContext:";
ss << "\nrank: " << rank;
ss << "\ngroup_size: " << group_size;
ss << "\nthread_num: " << thread_num;
ss << "\nthread_id: " << thread_id;
ss << "\nshm_ctx_stat_loop_seq: [";
for (int i = 0; i < group_size; ++i) {
ss << swizzled_ranks[i] << ", ";
}
ss << "]";
ss << "\nshm_contexts: [";
for (int i = 0; i < group_size; ++i) {
if (shm_contexts[i]) {
ss << shm_contexts[i]->rank << ", ";
}
}
ss << "]";
return ss.str();
}
};
class SHMManager {
public:
explicit SHMManager(const std::string& name, const int rank,
const int group_size)
: _rank(rank),
_group_size(group_size),
_thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
_shm_names({""}),
_shared_mem_ptrs({nullptr}),
_shm_ctx(nullptr) {
_shm_names[rank] = get_shm_name(name, rank);
_shared_mem_ptrs[rank] = init_shm(rank);
_shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
for (int i = 0; i < _thread_num; ++i) {
ThreadSHMContext* ctx = new (_shm_ctx + i)
ThreadSHMContext(i, _thread_num, _rank, _group_size,
compute_thread_shm_ptr(_shm_ctx, i));
}
}
void join(const std::string& name) {
for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
if (rank_idx != _rank) {
TORCH_CHECK(_shm_names[rank_idx].empty());
TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
_shm_names[rank_idx] = get_shm_name(name, rank_idx);
_shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
ThreadSHMContext* target_ctx =
reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
_shm_ctx[thread_idx].set_context(
rank_idx, target_ctx + thread_idx,
compute_thread_shm_ptr(target_ctx, thread_idx));
}
}
}
}
~SHMManager() { destroy_shm(); }
ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
static std::string get_shm_name(const std::string& name, int rank) {
return name + "_" + std::to_string(rank);
}
static int64_t create_singleton_instance(const std::string& name,
const int group_size,
const int rank) {
std::lock_guard<std::mutex> guard(SingletonInstancesLock);
SingletonInstances.emplace_back(
std::make_unique<SHMManager>(name, rank, group_size));
return static_cast<int64_t>(SingletonInstances.size() - 1);
}
static SHMManager* get_singleton_instance(int64_t handle) {
return SingletonInstances[handle].get();
}
protected:
static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
static std::mutex SingletonInstancesLock;
private:
static size_t round_to_alignment(size_t num) {
return ((num + 63) / 64) * 64;
}
int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
int8_t* thread_shm_ptr =
reinterpret_cast<int8_t*>(ctx) +
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
return thread_shm_ptr +
thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
}
size_t compute_shm_size() {
const size_t rounded_rank_buffer_size =
round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
const size_t rounded_thread_shm_ctx_size =
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
const size_t shm_size =
rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
return shm_size;
}
void* init_shm(int target_rank) {
const std::string& shm_name = _shm_names[target_rank];
const int local_rank = _rank;
const size_t shm_size = compute_shm_size();
int fd = -1;
if (local_rank == target_rank) {
fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
S_IRUSR | S_IWUSR);
if (fd == -1)
TORCH_CHECK(false, "create shm in SHMManager failed. errno: " +
std::to_string(errno));
if (ftruncate(fd, shm_size) == -1)
TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " +
std::to_string(errno));
} else {
fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1)
TORCH_CHECK(false, "open shm in SHMManager failed. errno: " +
std::to_string(errno));
}
void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, 0);
if (shm_ptr == MAP_FAILED) {
TORCH_CHECK(false,
"mmap in SHMManager failed. errno: " + std::to_string(errno));
}
if (close(fd) != 0) {
TORCH_CHECK(
false, "close in SHMManager failed. errno: " + std::to_string(errno));
}
TORCH_CHECK((size_t)shm_ptr % 64 == 0);
return shm_ptr;
}
void destroy_shm() {
std::stringstream ss;
ss << "local rank " << _rank << ": [";
for (int thread_id = 0; thread_id < _thread_num; ++thread_id) {
ss << _shm_ctx[thread_id]._spinning_count << ", ";
}
ss << "]\n";
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
if (_shared_mem_ptrs[i] != nullptr) {
munmap(_shared_mem_ptrs[i], compute_shm_size());
}
if (!_shm_names[i].empty()) {
shm_unlink(_shm_names[i].c_str());
}
}
}
int _rank;
int _group_size;
int _thread_num;
std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
ThreadSHMContext* _shm_ctx;
};
namespace shm_cc_ops {
template <typename scalar_t, typename F>
void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
int thread_num = ctx->thread_num;
int64_t total_bytes = elem_num * sizeof(scalar_t);
int64_t total_units_num =
(total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
int64_t per_thread_units_num =
(total_units_num + thread_num - 1) / thread_num;
int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
int64_t max_per_thread_iteration_elem_num =
PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < thread_num; ++i) {
int64_t offset = i * per_thread_elem_num;
int64_t end = std::min(elem_num, offset + per_thread_elem_num);
int64_t curr_elem_num =
std::min(max_per_thread_iteration_elem_num, end - offset);
ThreadSHMContext* thread_ctx = ctx + i;
while (curr_elem_num > 0) {
inner_func(thread_ctx, offset, curr_elem_num);
offset += max_per_thread_iteration_elem_num;
curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
}
}
}
}; // namespace shm_cc_ops
namespace shm_cc_ops {
void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_bytes; i += 64) {
vec_op::INT8Vec64 data(
true, (int8_t*)src + i); // stream loading shm to avoid caching
data.save((int8_t*)dst + i);
}
if (aligned_bytes < bytes) {
vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
}
}
void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
#pragma GCC unroll 4
for (int64_t i = 0; i < bytes; i += 64) {
vec_op::INT8Vec64 data((int8_t*)src + i);
data.nt_save((int8_t*)dst + i);
}
}
void memcpy(void* dst, void* src, const int64_t bytes) {
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_bytes; i += 64) {
vec_op::INT8Vec64 data((int8_t*)src + i);
data.save((int8_t*)dst + i);
}
if (aligned_bytes < bytes) {
vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes);
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
}
}
template <typename scalar_t, int RANKS>
void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
size_t elem_num) {
CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
constexpr int64_t vec_elem_num = vec_t::get_elem_num();
const int worldsize = ctx->group_size;
shm_cc_ops::shm_cc_loop<scalar_t>(
ctx, elem_num,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
scalar_t* thread_shm_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
scalar_t* thread_data_ptr = data + data_offset;
int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
scalar_t* remote_data_ptrs[RANKS - 1];
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
thread_ctx->get_swizzled_rank(idx + 1));
});
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
thread_data_elem_num);
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
int64_t aligned_data_elem_num =
(data_elem_num / vec_elem_num) * vec_elem_num;
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_data_elem_num; i += vec_elem_num) {
vec_t local_data(thread_data_ptr + i); // load from cache
vec_op::FP32Vec16 local_data_fp32(local_data);
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
vec_t remote_data(
true, remote_data_ptrs[idx] + i); // stream load from shm
vec_op::FP32Vec16 remote_data_fp32(remote_data);
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
});
vec_t reduced_data(local_data_fp32);
reduced_data.save(thread_data_ptr + i);
}
if (i < data_elem_num) {
vec_t local_data(thread_data_ptr + i); // load from cache
vec_op::FP32Vec16 local_data_fp32(local_data);
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
vec_t remote_data(
true, remote_data_ptrs[idx] + i); // stream load from shm
vec_op::FP32Vec16 remote_data_fp32(remote_data);
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
});
vec_t reduced_data(local_data_fp32);
reduced_data.save(thread_data_ptr + i,
data_elem_num - aligned_data_elem_num);
}
thread_ctx->barrier(ThreadSHMStat::DONE);
});
return;
}
}; // namespace shm_cc_ops
std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
std::mutex SHMManager::SingletonInstancesLock = {};
template <typename scalar_t>
void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
switch (ctx->group_size) {
case 2:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
break;
case 3:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
break;
case 4:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
break;
case 8:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
break;
default:
TORCH_CHECK(false,
"Invalid world size: " + std::to_string(ctx->group_size));
}
}
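// Illustration only (hypothetical helper, not part of the original file): a
// scalar reference for what all_reduce_sum_impl computes per chunk. Each rank
// accumulates, in float32, its own elements plus the same elements published
// to shared memory by the other RANKS - 1 peers, then stores the sum back in
// the original dtype.
template <typename scalar_t, int RANKS>
void all_reduce_sum_reference(scalar_t* local,
                              scalar_t* const remote[RANKS - 1],
                              size_t elem_num) {
  for (size_t i = 0; i < elem_num; ++i) {
    float acc = static_cast<float>(local[i]);
    for (int r = 0; r < RANKS - 1; ++r) {
      acc += static_cast<float>(remote[r][i]);
    }
    local[i] = static_cast<scalar_t>(acc);
  }
}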
template <typename scalar_t>
void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
scalar_t** outputs, const int dst) {
CPU_KERNEL_GUARD_IN(shm_gather_impl)
const int worldsize = ctx->group_size;
TORCH_CHECK_LT(dst, worldsize);
shm_cc_ops::shm_cc_loop<scalar_t>(
ctx, elem_num,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
scalar_t* thread_shm_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
data_elem_num * sizeof(scalar_t));
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
if (rank == dst) {
shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
data_elem_num * sizeof(scalar_t));
for (int i = 1; i < worldsize; ++i) {
int src_rank = thread_ctx->get_swizzled_rank(i);
scalar_t* src_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank); // shm
scalar_t* dst_ptr = outputs[src_rank] + data_offset;
shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
data_elem_num * sizeof(scalar_t));
}
}
thread_ctx->barrier(ThreadSHMStat::DONE);
});
return;
}
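// Illustration only (not part of the original file): the post-condition of
// shm_gather_impl. Every rank publishes its `data` chunk by chunk into its
// shared-memory slot; only rank `dst` assembles the result, so after the call
// outputs[r][0:elem_num] on rank dst holds rank r's input for every r in the
// group, and `outputs` may be null on the other ranks.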
struct MemPiece {
void* ptr;
int64_t size;
template <typename T>
T* data_ptr() {
return reinterpret_cast<T*>(ptr);
}
};
struct TensorListMeta {
int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];
torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];
int64_t tensor_num;
int64_t total_bytes;
TensorListMeta() : tensor_num(0), total_bytes(0) {
static_assert(sizeof(TensorListMeta) % 64 == 0);
static_assert(sizeof(TensorListMeta) <
MIN_THREAD_PROCESS_SIZE);  // To ensure the metadata is always
                           // handled by thread 0
for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
tensor_bytes[i] = 0;
tensor_ptrs[i] = nullptr;
tensor_types[i] = torch::ScalarType::Undefined;
}
}
// For send and recv
void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
"Re-bind TensorListMeta is not allowed.")
TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
tensor_num = tensor_list.size();
int64_t bytes_sum = 0;
for (int i = 0; i < tensor_list.size(); ++i) {
torch::Tensor& t = tensor_list[i];
TORCH_CHECK(t.is_contiguous());
tensor_bytes[i] = t.nbytes();
tensor_types[i] = t.scalar_type();
tensor_ptrs[i] = t.data_ptr();
bytes_sum += t.nbytes();
}
total_bytes = bytes_sum;
}
// For recv
std::vector<torch::Tensor> generate_tensor_list() {
std::vector<torch::Tensor> tensor_list;
tensor_list.reserve(tensor_num);
for (int i = 0; i < tensor_num; ++i) {
int64_t bytes = tensor_bytes[i];
auto type = tensor_types[i];
int64_t elem_bytes = torch::elementSize(type);
TORCH_CHECK_EQ(bytes % elem_bytes, 0);
int64_t elem_num = bytes / elem_bytes;
auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
tensor_list.emplace_back(torch::empty({elem_num}, options));
}
return tensor_list;
}
MemPiece get_data(int64_t offset) {
for (int i = 0; i < tensor_num; ++i) {
if (offset < tensor_bytes[i]) {
return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
tensor_bytes[i] - offset};
}
offset -= tensor_bytes[i];
}
return {nullptr, 0};
}
private:
void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
int8_t _padding[40];
};
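// Illustration only (hypothetical function, not part of the original file):
// how get_data() resolves a flat byte offset across the bound tensors, which
// lets a thread's byte range straddle tensor boundaries during send/recv.
inline void tensor_list_meta_offset_example() {
  std::vector<torch::Tensor> ts = {
      torch::zeros({128}, torch::dtype(torch::kInt8)),  // 128 bytes
      torch::zeros({64}, torch::dtype(torch::kInt8))};  // 64 bytes
  TensorListMeta meta;
  meta.bind_tensor_list(ts);
  MemPiece a = meta.get_data(0);    // {ts[0].data_ptr(), 128}
  MemPiece b = meta.get_data(128);  // {ts[1].data_ptr(), 64}
  MemPiece c = meta.get_data(160);  // {ts[1].data_ptr() + 32, 32}
  MemPiece d = meta.get_data(192);  // {nullptr, 0}, past total_bytes
  (void)a;
  (void)b;
  (void)c;
  (void)d;
}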
void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
const std::vector<torch::Tensor>& tensor_list) {
CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
std::vector<torch::Tensor> tensor_list_with_metadata;
tensor_list_with_metadata.reserve(1 + tensor_list.size());
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
tensor_list_with_metadata.emplace_back(
torch::empty({sizeof(TensorListMeta)}, options));
tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
tensor_list.begin(), tensor_list.end());
torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
metadata->bind_tensor_list(tensor_list_with_metadata);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata->total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
// Wait until the receiver sets the stat to DONE
thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
shm_cc_ops::memcpy(
thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
frag.ptr, frag.size);
curr_shm_offset += frag.size;
}
thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
});
}
std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
int64_t src) {
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
torch::Tensor metadata_tensor =
torch::empty({sizeof(TensorListMeta)}, options);
// Wait until the sender sets thread 0's stat to SHM_DATA_READY
ctx->wait_for_one(src, ThreadSHMStat::DONE);
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
ctx->get_thread_shm_ptr<void>(src),
sizeof(TensorListMeta));
TensorListMeta* src_metadata =
reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
std::vector<torch::Tensor> tensor_list_with_metadata =
src_metadata->generate_tensor_list();
TensorListMeta metadata;
metadata.bind_tensor_list(tensor_list_with_metadata);
TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata.total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
// Wait until the sender sets the stat to SHM_DATA_READY
thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
shm_cc_ops::memcpy(
frag.ptr,
thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
frag.size);
curr_shm_offset += frag.size;
}
thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
});
std::vector<torch::Tensor> tensor_list;
tensor_list.reserve(metadata.tensor_num - 1);
tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
tensor_list_with_metadata.end());
return tensor_list;
}
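// Illustration only (not part of the original file): the intended pairing of
// the two helpers above for a point-to-point exchange.
//   sender (rank s):   shm_send_tensor_list_impl(ctx, {t0, t1});
//   receiver:          auto tensors = shm_recv_tensor_list_impl(ctx, /*src=*/s);
// The sender first publishes a TensorListMeta header through thread 0's slot
// (it fits below MIN_THREAD_PROCESS_SIZE), then both sides stream the payload
// chunk by chunk, toggling the per-thread stat between SHM_DATA_READY and
// DONE to hand the buffer back and forth.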
} // namespace
void shm_gather(int64_t handle, torch::Tensor& data,
const std::optional<std::vector<torch::Tensor>>& outputs,
int64_t dst) {
TORCH_CHECK(data.is_contiguous())
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
CPU_KERNEL_GUARD_IN(shm_gather_impl)
if (outputs.has_value()) {
TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
for (int i = 0; i < outputs->size(); ++i) {
output_ptrs[i] = outputs->at(i).data_ptr<scalar_t>();
}
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
dst);
} else {
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel(), (scalar_t**)(0),
dst);
}
CPU_KERNEL_GUARD_OUT(shm_gather_impl)
});
}
void shm_all_gather(int64_t handle, const torch::Tensor& data,
torch::Tensor& output) {
TORCH_CHECK(data.is_contiguous())
TORCH_CHECK(output.is_contiguous())
const int64_t input_elem_num = data.numel();
const int64_t output_elem_num = output.numel();
TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
const int world_size = output_elem_num / input_elem_num;
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
TORCH_CHECK_EQ(ctx->group_size, world_size);
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
for (int i = 0; i < world_size; ++i) {
output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
}
shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
ctx->rank);
CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
});
}
void shm_allreduce(int64_t handle, torch::Tensor& data) {
TORCH_CHECK(data.is_contiguous())
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel());
CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
});
}
void shm_send_tensor_list(int64_t handle,
const std::vector<torch::Tensor>& tensor_list,
int64_t dst) {
CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
shm_send_tensor_list_impl(
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list);
CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
}
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
auto tensor_list = shm_recv_tensor_list_impl(
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src);
CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
return tensor_list;
}
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank) {
return SHMManager::create_singleton_instance(name, group_size, rank);
}
std::string join_shm_manager(int64_t handle, const std::string& name) {
auto shm_manager = SHMManager::get_singleton_instance(handle);
TORCH_CHECK(shm_manager);
shm_manager->join(name);
return shm_manager->get_shm_ctx()->to_string();
}
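// Illustration only (not part of the original file): a rough sketch of how
// the exported entry points above are sequenced on one rank. The exact
// create/join handshake (whose segment name is passed to join_shm_manager) is
// driven by the caller and not shown; names and shapes are hypothetical.
//   int64_t h = init_shm_manager("example_shm", /*group_size=*/4, /*rank=*/r);
//   join_shm_manager(h, peer_segment_name);  // attach to the peer's segment
//   torch::Tensor t = torch::ones({4096});
//   shm_allreduce(h, t);                     // in-place sum across the group
//   torch::Tensor out = torch::empty({4 * 4096});
//   shm_all_gather(h, t, out);               // out = concat of all ranks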

View File

@ -18,30 +18,6 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& bias);
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank);
std::string join_shm_manager(int64_t handle, const std::string& name);
void shm_allreduce(int64_t handle, torch::Tensor& data);
void shm_gather(int64_t handle, torch::Tensor& data,
const std::optional<std::vector<torch::Tensor>>& outputs,
int64_t dst);
void shm_all_gather(int64_t handle, const torch::Tensor& data,
torch::Tensor& output);
void shm_send_tensor_list(int64_t handle,
const std::vector<torch::Tensor>& tensor_list,
int64_t dst);
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@ -151,29 +127,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor? azp, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#endif
// SHM CCL
#ifdef __AVX512F__
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
&init_shm_manager);
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
ops.def(
"shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
"()");
ops.impl("shm_gather", torch::kCPU, &shm_gather);
ops.def(
"shm_all_gather(int handle, Tensor data, Tensor! output) -> "
"()");
ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
ops.def(
"shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
"()");
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
&shm_recv_tensor_list);
#endif
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@ -197,14 +150,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" str kv_cache_dtype,"
" Tensor k_scale, Tensor v_scale) -> ()");
cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
cache_ops.def(
"concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
" Tensor! kv_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" Tensor scale) -> ()");
cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
@ -212,12 +157,4 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
cpu_ops.def(
"mla_decode_kvcache("
" Tensor! out, Tensor query, Tensor kv_cache,"
" float scale, Tensor block_tables, Tensor seq_lens) -> ()");
cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
}
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@ -4,11 +4,6 @@
#include <string>
#include <sched.h>
#endif
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
#include <unistd.h>
#include <sys/syscall.h>
#define gettid() syscall(SYS_gettid)
#endif
#include "cpu_types.hpp"
@ -23,7 +18,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
#ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size);

View File

@ -1,39 +0,0 @@
#include <torch/all.h>
#include <torch/cuda.h>
#include <cuda_runtime.h>
// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
// Get raw host pointer from CPU tensor
void* host_ptr = cpu_tensor.data_ptr();
// Get a device pointer corresponding to the pinned host memory
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
// We'll use the same sizes, strides, and dtype as the CPU tensor.
// TODO: check if layout is respected.
auto sizes = cpu_tensor.sizes();
auto strides = cpu_tensor.strides();
auto options = cpu_tensor.options().device(torch::kCUDA);
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
// memory, so we don't free it here.
auto deleter = [](void*) {
// no-op, since the memory is owned by the original CPU tensor
};
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, deleter, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");
return cuda_tensor;
}
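// Illustration only (hypothetical function, not part of the original file):
// minimal usage of the helper above. It assumes UVA is available and that the
// CPU tensor was allocated in pinned (page-locked) memory, otherwise
// cudaHostGetDevicePointer() fails.
inline void example_cuda_view_of_pinned_tensor() {
  torch::Tensor cpu_t = torch::zeros(
      {1 << 20},
      torch::TensorOptions().dtype(torch::kFloat32).pinned_memory(true));
  torch::Tensor gpu_view = get_cuda_view_from_cpu_tensor(cpu_t);
  gpu_view.add_(1.0);  // runs on the GPU, writes land in the pinned buffer
  torch::cuda::synchronize();
  TORCH_CHECK(cpu_t[0].item<float>() == 1.0f);
}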

View File

@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank,
bool fully_connected) {
bool full_nvlink) {
int world_size = fake_ipc_ptrs.size();
if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported");
@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
}
return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
rank_data.numel(), rank, world_size,
fully_connected);
full_nvlink);
}
/**
@ -142,48 +142,3 @@ void register_graph_buffers(fptr_t _fa,
bytes.reserve(handles.size());
fa->register_graph_buffers(bytes, offsets);
}
std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {
auto device_index = c10::cuda::current_device();
at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
void* buffer;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
auto stream = c10::cuda::getCurrentCUDAStream().stream();
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Allocate buffer
#if defined(USE_ROCM)
// data buffers need to be "uncached" for signal on MI200
AT_CUDA_CHECK(
hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
#else
AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
#endif
AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Create IPC memhandle for the allocated buffer.
// Will use it in open_mem_handle.
auto options =
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
auto handle =
torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
AT_CUDA_CHECK(
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
}
fptr_t open_mem_handle(torch::Tensor& mem_handle) {
void* ipc_ptr;
AT_CUDA_CHECK(cudaIpcOpenMemHandle(
(void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
cudaIpcMemLazyEnablePeerAccess));
return reinterpret_cast<fptr_t>(ipc_ptr);
}
void free_shared_buffer(fptr_t buffer) {
AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
}

View File

@ -5,10 +5,6 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#if defined(USE_ROCM)
typedef __hip_bfloat16 nv_bfloat16;
#endif
#include <iostream>
#include <array>
#include <limits>
@ -16,7 +12,6 @@ typedef __hip_bfloat16 nv_bfloat16;
#include <unordered_map>
#include <vector>
namespace vllm {
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
@ -27,37 +22,24 @@ namespace vllm {
} \
} while (0)
// Maximal number of blocks in allreduce kernel.
namespace vllm {
constexpr int kMaxBlocks = 36;
// Default number of blocks in allreduce kernel.
#ifndef USE_ROCM
const int defaultBlockLimit = 36;
CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#else
const int defaultBlockLimit = 16;
hipPointer_attribute rangeStartAddrAttr =
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#endif
// Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior.
using FlagType = uint32_t;
// Two sets of peer counters are needed for two syncs: starting and ending an
// operation. The reason is that it's possible for peer GPU block to arrive at
// the second sync point while the current GPU block haven't passed the first
// sync point. Thus, peer GPU may write counter+1 while current GPU is busy
// waiting for counter. We use alternating counter array to avoid this
// possibility.
struct Signal {
alignas(128) FlagType start[kMaxBlocks][8];
alignas(128) FlagType end[kMaxBlocks][8];
alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank
alignas(128) FlagType self_counter[kMaxBlocks][8];
// Two sets of peer counters are needed for two syncs. The reason is that
// it's possible for peer GPU block to arrive at the second sync point while
// the current GPU block haven't passed the first sync point. Thus, peer GPU
// may write counter+1 while current GPU is busy waiting for counter. We use
// alternating counter array to avoid this possibility.
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};
struct __align__(16) RankData {
const void* ptrs[8];
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankSignals {
@ -152,29 +134,27 @@ DINLINE O downcast(array_t<float, O::size> val) {
}
}
#if !defined(USE_ROCM)
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#else
#else
asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#endif
#endif
}
static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
FlagType flag;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
: "=r"(flag)
: "l"(flag_addr));
#else
#else
asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
: "=r"(flag)
: "l"(flag_addr));
#endif
#endif
return flag;
}
@ -190,99 +170,37 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
return flag;
}
// This function is meant to be used as the first synchronization in the all
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
// prior memory accesses. Note: volatile writes will not be reordered against
// other volatile writes.
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
// is_start: whether this is the very first synchronization barrier.
// need_fence: whether a memory fence is needed. If true, a release-acquire
// semantic is used to enforce memory access order before and after this
// barrier.
template <int ngpus, bool is_start, bool need_fence = false>
DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
int rank) {
if constexpr (!is_start) __syncthreads();
static_assert(
!(is_start && need_fence)); // Start barrier shouldn't need fence.
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x];
// Write the expected counter value to peer and wait for correct value
// from peer.
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
// This function is meant to be used as the second or the final
// synchronization barrier in the all reduce kernel. If it's the final
// synchronization barrier, we don't need to make any visibility guarantees
// for prior memory accesses.
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
// Increment the counter. Technically we only need one counter, but we use
// multiple per block to eliminate the need to share the counter via smem.
auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
// Write the expected counter value to peer and wait for correct value from
// peer.
if constexpr (!final_sync) {
st_flag_release(peer_counter_ptr, flag);
while (ld_flag_acquire(self_counter_ptr) != flag);
auto peer_counter_ptr =
&sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
auto self_counter_ptr =
&self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
if constexpr (need_fence) {
st_flag_release(peer_counter_ptr, val);
while (ld_flag_acquire(self_counter_ptr) != val);
} else {
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
st_flag_volatile(peer_counter_ptr, val);
while (ld_flag_volatile(self_counter_ptr) != val);
}
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
if constexpr (is_start || need_fence) __syncthreads();
}
#else
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE) < flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
flag,
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
__MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (
__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
__MEMORY_SCOPE_DEVICE) < flag);
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
#endif
template <typename P, int ngpus, typename A>
DINLINE P packed_reduce(const P* ptrs[], int idx) {
A tmp = upcast(ptrs[0][idx]);
@ -302,13 +220,13 @@ __global__ void __launch_bounds__(512, 1)
// note: we don't reorder the address so the accumulation order is the same
// for all ranks, ensuring bitwise identical results
auto dp = *_dp;
barrier_at_start<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
}
barrier_at_end<ngpus, true>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
}
template <typename P>
@ -337,20 +255,18 @@ __global__ void __launch_bounds__(512, 1)
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
}
auto tmp_out = tmps[0];
barrier_at_start<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
}
barrier_at_end<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
// between threads that have the same tid. If thread i computes the sum of
// start + i in the first stage, then thread i also gathers start + i from
// all ranks.
// start + i in the first stage, then thread i also gathers start + i from all
// ranks.
for (int idx = tid; idx < largest_part; idx += stride) {
#pragma unroll
for (int i = 0; i < ngpus; i++) {
@ -371,22 +287,21 @@ class CustomAllreduce {
public:
int rank_;
int world_size_;
// Full NVLink or xGMI connection between GPUs.
bool fully_connected_;
bool full_nvlink_;
RankSignals sg_;
// Stores a map from a pointer to its peer pointers from all ranks.
// Stores an map from a pointer to its peer pointters from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
// For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph
// capture time. Therefore, during capture, we increment the rank data
// pointer and use that as the argument to the kernel. The kernel arguments
// are stored in graph_unreg_buffers_. The actual peer pointers will be
// filled in at the memory pointed to by the pointers in
// graph_unreg_buffers_ when the IPC handles are exchanged between ranks.
// capture time. However, the peer pointers are not known during graph capture
// time. Therefore, during capture, we increment the rank data pointer and use
// that as the argument to the kernel. The kernel arguments are stored in
// graph_unreg_buffers_. The actual peer pointers will be filled in at the
// memory pointed to by the pointers in graph_unreg_buffers_ when
// the IPC handles are exchanged between ranks.
//
// The overall process looks like this:
// 1. Graph capture.
@ -404,18 +319,17 @@ class CustomAllreduce {
* Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffer, the layout is as follows:
* | -- sizeof(Signal) -- | ------ a few MB ----- |
* The first section is for allreduce synchronization, and the second
* section is for storing the intermediate results required by some
* allreduce algos.
* The first section is for allreduce synchronization, and the second section
* is for storing the intermediate results required by some allreduce algos.
*
* Note: this class does not own any device memory. Any required buffers
* are passed in from the constructor.
*/
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
int rank, int world_size, bool fully_connected = true)
int rank, int world_size, bool full_nvlink = true)
: rank_(rank),
world_size_(world_size),
fully_connected_(fully_connected),
full_nvlink_(full_nvlink),
self_sg_(signals[rank]),
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
@ -447,7 +361,8 @@ class CustomAllreduce {
void* base_ptr;
// note: must share the base address of each allocation, or we get wrong
// address
if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr,
if (cuPointerGetAttribute(&base_ptr,
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
(CUdeviceptr)ptr) != CUDA_SUCCESS)
throw std::runtime_error("failed to get pointer attr");
CUDACHECK(cudaIpcGetMemHandle(
@ -481,11 +396,11 @@ class CustomAllreduce {
// Note: when registering graph buffers, we intentionally choose to not
// deduplicate the addresses. That means if the allocator reuses some
// addresses, they will be registered again. This is to account for the
// remote possibility of different allocation patterns between ranks. For
// example, rank 1 may get the same input address for the second allreduce,
// but rank 2 got a different address. IPC handles have internal reference
// counting mechanism so overhead should be small.
// addresses, they will be registered again. This is to account for the remote
// possibility of different allocation patterns between ranks. For example,
// rank 1 may get the same input address for the second allreduce, but rank 2
// got a different address. IPC handles have internal reference counting
// mechanism so overhead should be small.
void register_graph_buffers(
const std::vector<std::string>& handles,
const std::vector<std::vector<int64_t>>& offsets) {
@ -516,15 +431,15 @@ class CustomAllreduce {
/**
* Performs allreduce, assuming input has already been registered.
*
* Block and grid default configs are results after careful grid search.
* Using 36 blocks give the best or close to the best runtime on the devices
* I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also
* only take a small amount of SMs. Not quite sure the underlying reason,
* but my guess is that too many SMs will cause contention on NVLink bus.
* Block and grid default configs are results after careful grid search. Using
* 36 blocks give the best or close to the best runtime on the devices I
* tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
* take a small amount of SMs. Not quite sure the underlying reason, but my
* guess is that too many SMs will cause contention on NVLink bus.
*/
template <typename T>
void allreduce(cudaStream_t stream, T* input, T* output, int size,
int threads = 512, int block_limit = defaultBlockLimit) {
int threads = 512, int block_limit = 36) {
auto d = packed_t<T>::P::size;
if (size % d != 0)
throw std::runtime_error(
@ -558,11 +473,13 @@ class CustomAllreduce {
#define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
rank_, size);
// TODO(hanzhi713): Threshold is different for A100 and H100.
// Add per device threshold.
#define REDUCE_CASE(ngpus) \
case ngpus: { \
if (world_size_ == 2) { \
KL(ngpus, cross_device_reduce_1stage); \
} else if (fully_connected_) { \
} else if (full_nvlink_) { \
if ((world_size_ <= 4 && bytes < 512 * 1024) || \
(world_size_ <= 8 && bytes < 256 * 1024)) { \
KL(ngpus, cross_device_reduce_1stage); \
@ -580,8 +497,7 @@ class CustomAllreduce {
REDUCE_CASE(8)
default:
throw std::runtime_error(
"custom allreduce only supports num gpus in (2,4,6,8). Actual "
"num "
"custom allreduce only supports num gpus in (2,4,6,8). Actual num "
"gpus = " +
std::to_string(world_size_));
}
@ -595,11 +511,10 @@ class CustomAllreduce {
}
}
};
/**
* To inspect PTX/SASS, copy paste this header file to compiler explorer and
add a template instantiation:
* To inspect PTX/SASS, copy paste this header file to compiler explorer and add
a template instantiation:
* template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
half *, int, int, int);
*/
} // namespace vllm
} // namespace vllm

View File

@ -1,9 +1,9 @@
/**
* This is a standalone test for custom allreduce.
* To compile, make sure you have MPI and NCCL installed in your system.
* export MPI_HOME=XXX
* export MPI_HOME=xxx
* nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
* custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
* custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
*
* Warning: this C++ test is not designed to be very readable and was used
* during the rapid prototyping process.
@ -22,15 +22,7 @@
#include "cuda_profiler_api.h"
#include "custom_all_reduce.cuh"
#include "mpi.h"
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
typedef __hip_bfloat16 nv_bfloat16;
#include "rccl/rccl.h"
#include "custom_all_reduce_hip.cuh"
#else
#include "nccl.h"
#include "custom_all_reduce.cuh"
#endif
#include "nccl.h"
#define MPICHECK(cmd) \
do { \
@ -51,29 +43,16 @@ typedef __hip_bfloat16 nv_bfloat16;
} \
} while (0)
#ifdef USE_ROCM
__global__ void dummy_kernel() {
for (int i = 0; i < 100; i++) {
uint64_t start = wall_clock64();
uint64_t cycles_elapsed;
do {
cycles_elapsed = wall_clock64() - start;
} while (cycles_elapsed < 100);
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
}
#else
__global__ void dummy_kernel() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
#else
for (int i = 0; i < 100; i++) {
long long int start = clock64();
while (clock64() - start < 150000000); // approximately 98.4ms on P40
}
#endif
}
#endif
}
template <typename T>
__global__ void set_data(T* data, int size, int myRank) {
@ -142,14 +121,8 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
* registration, they are allocated and registered together in the test for
* convenience.
*/
#ifdef USE_ROCM
CUDACHECK(hipExtMallocWithFlags(
(void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
hipDeviceMallocUncached));
#else
CUDACHECK(
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
#endif
CUDACHECK(
cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
@ -338,18 +311,13 @@ int main(int argc, char** argv) {
bool performance_test = true;
cudaProfilerStart();
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
#ifdef USE_ROCM
const int block_limit = 16;
#else
const int block_limit = 36;
#endif
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
// Scan through different sizes to test performance.
for (int sz = 512; sz <= (8 << 20); sz *= 2) {
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
@ -358,4 +326,4 @@ int main(int argc, char** argv) {
cudaProfilerStop();
MPICHECK(MPI_Finalize());
return EXIT_SUCCESS;
}
}

View File

@ -48,14 +48,4 @@ struct enable_sm90_or_later : Kernel {
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};
template <typename Kernel>
struct enable_sm90_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};
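// Illustration only (hypothetical functor, not part of the diff): wrapping a
// device functor in the gate above makes its body compile to a no-op below
// the targeted architecture, so multi-arch builds never instantiate
// Hopper-only code paths.
struct ExampleEpilogue {
  CUTLASS_DEVICE void operator()(float* out, float v) { *out = v; }
};
using GatedExampleEpilogue = enable_sm90_or_later<ExampleEpilogue>;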
};

View File

@ -1,457 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
*reserved. SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
*POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
//
// This file is a modified excerpt of
// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
// from https://github.com/NVIDIA/cutlass v3.5.0
// It has been modified to support either row/column or scalar broadcasting
// where the tensor being loaded from is always passed in via a device pointer.
// This lets one compiled kernel handle all cases of per-tensor or
// per-channel/per-token quantization.
//
// This interface also allows the scales to be passed in as tensors that
// consistently reside on the device, which avoids an issue with a previous
// implementation where scalars needed to be on the CPU since they
// were passed in via float values. This created a potential performance hazard
// if scales were initially on the device, and caused torch.compile graphs
// breaks when moving scales to the CPU.
//
#pragma once
// Turn off clang-format for the entire file to keep it close to upstream
// clang-format off
#include "cutlass/cutlass.h"
#include "cutlass/arch/barrier.h"
#include "cute/tensor.hpp"
#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
namespace cutlass::epilogue::fusion {
using namespace cute;
using namespace detail;
// Row vector broadcast
template<
int Stages,
class CtaTileShapeMNK,
class Element,
class StrideMNL = Stride<_0,_1,_0>,
int Alignment = 128 / sizeof_bits_v<Element>
>
struct Sm90RowOrScalarBroadcastArray {
static_assert(Stages == 0, "Row broadcast doesn't support smem usage");
static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{});
struct SharedStorage {
array_aligned<Element, size<1>(CtaTileShapeMNK{})> smem;
};
// This struct has been modified to have a bool indicating that ptr_row is a
// scalar that must be broadcast, instead of containing a scalar that is
// valid if ptr_row is null.
struct Arguments {
const Element* const* ptr_row_array = nullptr;
bool row_broadcast = true;
StrideMNL dRow = {};
};
using Params = Arguments;
template <class ProblemShape>
static constexpr Params
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
return args;
}
template <class ProblemShape>
static bool
can_implement(ProblemShape const& problem_shape, Arguments const& args) {
return true;
}
template <class ProblemShape>
static size_t
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
return 0;
}
template <class ProblemShape>
static cutlass::Status
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
CudaHostAdapter* cuda_adapter = nullptr) {
return cutlass::Status::kSuccess;
}
CUTLASS_HOST_DEVICE
Sm90RowOrScalarBroadcastArray() { }
CUTLASS_HOST_DEVICE
Sm90RowOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage)
: params(params)
, smem(const_cast<Element*>(shared_storage.smem.data())) { }
Params params;
Element *smem = nullptr;
CUTLASS_DEVICE bool
is_producer_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_C_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_zero() const {
return (!params.row_broadcast && *(params.ptr_row_array[group]) == Element(0));
}
template <class... Args>
CUTLASS_DEVICE auto
get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
return EmptyProducerLoadCallbacks{};
}
template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class CTensor, class ThrResidue, class ThrNum>
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
CUTLASS_DEVICE
ConsumerStoreCallbacks(
GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_,
GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_,
SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_,
int group, Params const& params_)
: tGS_gRow(tGS_gRow_)
, tGS_sRow(tGS_sRow_)
, tGS_cRow(tGS_cRow_)
, tiled_G2S(tiled_g2s_)
, tSR_sRow(tSR_sRow_)
, tSR_rRow(tSR_rRow_)
, tCcRow(tCcRow_)
, residue_tCcRow(residue_tCcRow_)
, group(group)
, params(params_) {}
GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N)
GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N)
GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N)
Tiled_G2S tiled_G2S;
SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
ThrResidue residue_tCcRow; // (m, n)
ThrNum thr_num;
int group;
Params const& params;
CUTLASS_DEVICE void
begin() {
if (!params.row_broadcast) {
fill(tSR_rRow, *(params.ptr_row_array[group]));
return;
}
auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride()));
for (int i = 0; i < size(tGS_gRow_flt); ++i) {
if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
continue; // OOB of SMEM,
}
if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) {
tGS_sRow_flt(i) = tGS_gRow_flt(i);
}
else {
tGS_sRow_flt(i) = Element(0); // Set to zero when OOB so LDS can be issued without any preds.
}
}
synchronize();
}
CUTLASS_DEVICE void
begin_loop(int epi_m, int epi_n) {
if (epi_m == 0) { // Assumes M-major subtile loop
if (!params.row_broadcast) return; // Do not issue LDS when row is scalar
Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
Tensor tSR_rRow_flt = filter_zeros(tSR_rRow);
copy(tSR_sRow_flt, tSR_rRow_flt);
}
}
template <typename ElementAccumulator, int FragmentSize>
CUTLASS_DEVICE Array<Element, FragmentSize>
visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
Array<Element, FragmentSize> frg_row;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < FragmentSize; ++i) {
frg_row[i] = tSR_rRow(epi_v * FragmentSize + i);
}
return frg_row;
}
};
template <
bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
class... Args
>
CUTLASS_DEVICE auto
get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
auto [M, N, K, L] = args.problem_shape_mnkl;
auto [m, n, k, l] = args.tile_coord_mnkl;
using ThreadCount = decltype(size(args.tiled_copy));
Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row_array[l]), make_shape(M,N,1), params.dRow);
Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N)
Tensor sRow = make_tensor(make_smem_ptr(smem),
make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N)
//// G2S: Gmem to Smem
auto tiled_g2s = make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
Layout< Shape<_1, ThreadCount>,
Stride<_0, _1>>{},
Layout<_1>{});
auto thr_g2s = tiled_g2s.get_slice(args.thread_idx);
Tensor tGS_gRow = thr_g2s.partition_S(gRow);
Tensor tGS_sRow = thr_g2s.partition_D(sRow);
//// G2S: Coord
auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})));
Tensor tGS_cRow = thr_g2s.partition_S(cRow);
//// S2R: Smem to Reg
Tensor tSR_sRow = sm90_partition_for_epilogue<ReferenceSrc>(sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N)
return ConsumerStoreCallbacks<decltype(tGS_gRow), decltype(tGS_sRow), decltype(tGS_cRow), decltype(tiled_g2s), decltype(tSR_sRow), decltype(tSR_rRow), decltype(args.tCcD), decltype(args.residue_cD), ThreadCount>(
tGS_gRow,
tGS_sRow,
tGS_cRow, tiled_g2s,
tSR_sRow,
tSR_rRow,
args.tCcD,
args.residue_cD,
ThreadCount{},
l,
params);
}
};
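// Illustration only (not part of the diff): the two modes encoded by the
// Arguments above. With row_broadcast == true, ptr_row_array[l] points at a
// full row of per-channel values for group l; with row_broadcast == false it
// points at a single scalar that begin() splats into the register fragment,
// so one compiled kernel covers both per-channel and per-tensor scales.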
/////////////////////////////////////////////////////////////////////////////////////////////////
// Column vector broadcast
template<
int Stages,
class CtaTileShapeMNK,
class Element,
class StrideMNL = Stride<_1,_0,_0>,
int Alignment = 128 / sizeof_bits_v<Element>
>
struct Sm90ColOrScalarBroadcastArray {
static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet");
static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
static_assert(
(cute::is_same_v<StrideMNL, Stride<_1,_0, _0>>) || // col vector broadcast, e.g. per-row alpha/bias
(cute::is_same_v<StrideMNL, Stride<_1,_0,int>>)); // batched col vector broadcast, e.g. batched per-row bias
// Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
struct SharedStorage { };
// This struct has been modified to have a bool indicating that ptr_col is a
// scalar that must be broadcast, instead of containing a scalar that is
// valid if ptr_col is null.
struct Arguments {
const Element* const* ptr_col_array = nullptr;
bool col_broadcast = true;
StrideMNL dCol = {};
};
using Params = Arguments;
template <class ProblemShape>
static constexpr Params
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
return args;
}
template <class ProblemShape>
static bool
can_implement(ProblemShape const& problem_shape, Arguments const& args) {
return true;
}
template <class ProblemShape>
static size_t
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
return 0;
}
template <class ProblemShape>
static cutlass::Status
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
CudaHostAdapter* cuda_adapter = nullptr) {
return cutlass::Status::kSuccess;
}
CUTLASS_DEVICE bool
is_producer_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_C_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_zero() const {
return (!params.col_broadcast && *(params.ptr_col_array[group]) == Element(0));
}
CUTLASS_HOST_DEVICE
Sm90ColOrScalarBroadcastArray() { }
CUTLASS_HOST_DEVICE
Sm90ColOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage)
: params(params) { }
Params params;
template <class... Args>
CUTLASS_DEVICE auto
get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
return EmptyProducerLoadCallbacks{};
}
template<class GTensor, class RTensor, class CTensor, class ProblemShape>
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
CUTLASS_DEVICE
ConsumerStoreCallbacks(
GTensor&& tCgCol,
RTensor&& tCrCol,
CTensor&& tCcCol,
ProblemShape problem_shape,
int group,
Params const& params
):
tCgCol(cute::forward<GTensor>(tCgCol)),
tCrCol(cute::forward<RTensor>(tCrCol)),
tCcCol(cute::forward<CTensor>(tCcCol)),
m(get<0>(problem_shape)),
group(group),
params(params) {}
GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
RTensor tCrCol;
CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
Params const& params;
int m;
int group;
CUTLASS_DEVICE void
begin() {
Tensor pred = make_tensor<bool>(shape(tCgCol));
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(pred); ++i) {
pred(i) = get<0>(tCcCol(i)) < m;
}
if (!params.col_broadcast) {
fill(tCrCol, *(params.ptr_col_array[group]));
return;
}
// Filter so we don't issue redundant copies over stride-0 modes
// (only works if 0-strides are in same location, which is by construction)
copy_if(pred, filter(tCgCol), filter(tCrCol));
}
template <typename ElementAccumulator, int FragmentSize>
CUTLASS_DEVICE Array<Element, FragmentSize>
visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
Array<Element, FragmentSize> frg_col;
Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < FragmentSize; ++i) {
frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
}
return frg_col;
}
};
template <
bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
class... Args
>
CUTLASS_DEVICE auto
get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
auto [M, N, K, L] = args.problem_shape_mnkl;
auto [m, n, k, l] = args.tile_coord_mnkl;
Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col_array[l]), make_shape(M,N,1), params.dCol);
Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
// Generate an identity tensor matching the shape of the global tensor and
// partition the same way, this will be used to generate the predicate
// tensor for loading
Tensor cCol = make_identity_tensor(mCol.shape());
Tensor tCcCol = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
return ConsumerStoreCallbacks(
cute::move(tCgCol),
cute::move(tCrCol),
cute::move(tCcCol),
args.problem_shape_mnkl,
l,
params
);
}
};
}

View File

@ -1,7 +1,6 @@
#pragma once
#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp"
/*
This file defines custom epilogues for fusing channel scales, token scales,
@ -70,16 +69,6 @@ struct ScaledEpilogueBase {
0 /*Stages*/, TileShape, T, T, Stride<Int<0>, Int<1>, Int<0>>,
128 / sizeof_bits_v<T>, EnableNullPtr>;
template <typename T>
using ColOrScalarLoadArray =
cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray<
0 /*Stages*/, TileShape, T, Stride<Int<1>, Int<0>, Int<0>>>;
template <typename T>
using RowOrScalarLoadArray =
cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray<
0 /*Stages*/, TileShape, T, Stride<Int<0>, Int<1>, Int<0>>>;
// This utility function constructs the arguments for the load descriptors
// from a tensor. It can handle both row and column, as well as row/column or
// scalar cases.
@ -107,14 +96,6 @@ struct ScaledEpilogueBase {
std::is_same_v<Descriptor, RowLoad<T, true>>);
return Arguments{data_ptr};
}
template <typename Descriptor, typename T>
static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) {
using Arguments = typename Descriptor::Arguments;
static_assert(std::is_same_v<Descriptor, ColOrScalarLoadArray<T>> ||
std::is_same_v<Descriptor, RowOrScalarLoadArray<T>>);
return Arguments{data_ptr, do_broadcast};
}
};
/*
@ -400,51 +381,4 @@ struct ScaledEpilogueBiasAzpToken
}
};
/*
This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers
to arrays containing different scales used in group gemm. The number of
pointers in ScaleA and the number of pointers in ScaleB are equal to the
group size.
*/
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogueArray
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoadArray<float>;
using ScaleB = typename SUPER::template RowOrScalarLoadArray<float>;
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies, float, float,
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute0 =
cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies, ElementD, float,
cutlass::FloatRoundStyle::round_to_nearest>;
public:
using EVTCompute =
cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
using ArgumentType = typename EVTCompute::Arguments;
using ScaleAArray = typename SUPER::template ColOrScalarLoadArray<float>;
using ScaleBArray = typename SUPER::template RowOrScalarLoadArray<float>;
static ArgumentType prepare_args(float const* const* a_scales_ptr,
float const* const* b_scales_ptr,
bool a_col_broadcast, bool b_row_broadcast) {
auto a_args = SUPER::template args_from_tensor<ScaleAArray, float>(
a_scales_ptr, a_col_broadcast);
auto b_args = SUPER::template args_from_tensor<ScaleBArray, float>(
b_scales_ptr, b_row_broadcast);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, {}};
}
};
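// Illustration only (not part of the diff): how the arguments above would be
// assembled for a grouped GEMM, assuming `a_scales` / `b_scales` are device
// arrays of per-group scale pointers and `Epilogue` is an instantiation of
// ScaledEpilogueArray for the kernel's accumulator/output types.
//   auto args = Epilogue::prepare_args(a_scales, b_scales,
//                                      /*a_col_broadcast=*/true,
//                                      /*b_row_broadcast=*/true);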
}; // namespace vllm::c3x

Some files were not shown because too many files have changed in this diff.