Add time comment

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Merge branch 'main' into mamba_tests
2025-03-17 13:50:44 +00:00 · 2025-03-17 13:49:56 +00:00 · 2025-03-15 17:31:21 +00:00 · 2025-03-15 17:26:01 +00:00
1446 changed files with 41942 additions and 121614 deletions
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.335
-  - name: "exact_match,flexible-extract"
-    value: 0.323
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.54
-  - name: "exact_match,flexible-extract"
-    value: 0.59
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.47
-  - name: "exact_match,flexible-extract"
-    value: 0.64
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-from pathlib import Path
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--config-list-file",
-        action="store",
-        help="Path to the file listing model config YAMLs (one per line)")
-    parser.addoption("--tp-size",
-                     action="store",
-                     default="1",
-                     help="Tensor parallel size to use for evaluation")
-
-
-@pytest.fixture(scope="session")
-def config_list_file(pytestconfig, config_dir):
-    rel_path = pytestconfig.getoption("--config-list-file")
-    return config_dir / rel_path
-
-
-@pytest.fixture(scope="session")
-def tp_size(pytestconfig):
-    return pytestconfig.getoption("--tp-size")
-
-
-def pytest_generate_tests(metafunc):
-    if "config_filename" in metafunc.fixturenames:
-        rel_path = metafunc.config.getoption("--config-list-file")
-        config_list_file = Path(rel_path).resolve()
-        config_dir = config_list_file.parent
-        with open(config_list_file, encoding="utf-8") as f:
-            configs = [
-                config_dir / line.strip() for line in f
-                if line.strip() and not line.startswith("#")
-            ]
-        metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c ) 
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+    
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -3,25 +3,35 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

-pytest -s -v test_lm_eval_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4 
+* pytest -s test_lm_eval_correctness.py
 """

+import os
+from pathlib import Path
+
 import lm_eval
-import numpy as np
+import numpy
+import pytest
 import yaml

-RTOL = 0.08
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


-def launch_lm_eval(eval_config, tp_size):
+def launch_lm_eval(eval_config):
    trust_remote_code = eval_config.get('trust_remote_code', False)
+
    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={tp_size}," \
-                 f"enforce_eager=true," \
+                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"
+
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
@ -29,14 +39,22 @@ def launch_lm_eval(eval_config, tp_size):
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto")
+
    return results


-def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

-    results = launch_lm_eval(eval_config, tp_size)
+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")

+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
+
+    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
@ -44,7 +62,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and np.isclose(
+            success = success and numpy.isclose(
                ground_truth, measured_value, rtol=RTOL)

+    # Assert at the end, print all scores even on failure for debugging.
    assert success
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -10,24 +10,15 @@ set -x
 set -o pipefail

 check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  fi
-
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  echo "GPU type is $gpu_type"
 }

@ -99,15 +90,9 @@ kill_gpu_processes() {


  # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done

  # remove vllm config file
  rm -rf ~/.config/vllm
@ -376,7 +361,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOGGING_LEVEL="WARNING"
+  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -63,12 +63,10 @@
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
+            "swap_space": 16, 
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,23 +1,23 @@
 steps:
-  - label: "Build wheel - CUDA 12.8"
+  - label: "Build wheel - CUDA 12.4"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build wheel - CUDA 12.6"
+  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -31,10 +31,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -48,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build and publish TPU release image"
@ -57,9 +57,7 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
@ -84,22 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
-
-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
-    agents:
-      queue: neuron-postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -75,84 +75,49 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 #ignore certain kernels tests
-if [[ $commands == *" kernels/core"* ]]; then
+if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
-  --ignore=kernels/core/test_fused_quant_layernorm.py \
-  --ignore=kernels/core/test_permute_cols.py"
-fi
-
-if [[ $commands == *" kernels/attention"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/attention/stest_attention_selector.py \
-  --ignore=kernels/attention/test_blocksparse_attention.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_flash_attn.py \
-  --ignore=kernels/attention/test_flashinfer.py \
-  --ignore=kernels/attention/test_prefix_prefill.py \
-  --ignore=kernels/attention/test_cascade_flash_attn.py \
-  --ignore=kernels/attention/test_mha_attn.py \
-  --ignore=kernels/attention/test_lightning_attn.py \
-  --ignore=kernels/attention/test_attention.py"
-fi
-
-if [[ $commands == *" kernels/quantization"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/quantization/test_aqlm.py \
-  --ignore=kernels/quantization/test_machete_mm.py \
-  --ignore=kernels/quantization/test_block_fp8.py \
-  --ignore=kernels/quantization/test_block_int8.py \
-  --ignore=kernels/quantization/test_marlin_gemm.py \
-  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-  --ignore=kernels/quantization/test_int8_kernel.py"
-fi
-
-if [[ $commands == *" kernels/mamba"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/mamba/test_mamba_mixer2.py \
-  --ignore=kernels/mamba/test_causal_conv1d.py \
-  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-fi
-
-if [[ $commands == *" kernels/moe"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/moe/test_moe.py \
-  --ignore=kernels/moe/test_cutlass_moe.py \
-  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py \
+  --ignore=kernels/test_cascade_flash_attn.py \
+  --ignore=kernels/test_mamba_mixer2.py \
+  --ignore=kernels/test_aqlm.py \
+  --ignore=kernels/test_machete_mm.py \
+  --ignore=kernels/test_mha_attn.py \
+  --ignore=kernels/test_block_fp8.py \
+  --ignore=kernels/test_permute_cols.py"
 fi

 #ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_chat.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

 #ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 fi

-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@ -169,10 +134,9 @@ if [[ $commands == *"--shard-id="* ]]; then
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
-    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
-        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network=host \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
@ -199,10 +163,9 @@ if [[ $commands == *"--shard-id="* ]]; then
    fi
  done
 else
-  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-          --network=host \
+          --device /dev/kfd --device /dev/dri \
+          --network host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -5,8 +5,8 @@
 set -ex
 set -o pipefail

-# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+# cd into the parent of the directory containing this script
+cd "$(dirname "${BASH_SOURCE[0]}")/.."

 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.s390x .
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -8,19 +8,15 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+
 # Setup cleanup
-remove_docker_container() { 
-    set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
-}
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
-
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,8 +36,8 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+    pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@ -9,13 +9,11 @@ python3 use_existing_torch.py

 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
-  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
-  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"

@ -25,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and test offline inference
-docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -5,7 +5,7 @@
 set -ex

 # Try building the docker image
-docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+docker build -t hpu-test-env -f Dockerfile.hpu .

 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@ -3,7 +3,7 @@
 set -euox pipefail

 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi

--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -35,7 +35,7 @@ else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .

 # Setup cleanup
 remove_docker_container() {
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+# This script builds the OpenVINO docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t openvino-test -f Dockerfile.openvino .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f openvino-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-tpu-v1-test.sh
+++ b/.buildkite/run-tpu-v1-test.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -8,15 +8,14 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
+docker build -t ${image_name} -f Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() { 
-  docker rm -f "${container_name}" || true; 
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
+  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
 }
 trap remove_docker_container EXIT
+remove_docker_container

 # Run the image and test offline inference/tensor parallel
 docker run \
@ -26,6 +25,6 @@ docker run \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
-    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -1,45 +0,0 @@
-#!/bin/bash
-
-# This script build the CPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Setup cleanup
-remove_docker_container() {
-  if [[ -n "$container_id" ]]; then
-      podman rm -f "$container_id" || true
-  fi
-  podman system prune -f
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
-
-# Run the image
-container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
-
-function cpu_tests() {
-
-  # offline inference
-  podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run basic model test
-  podman exec -it "$container_id" bash -c "
-    set -e
-    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-
-export container_id
-export -f cpu_tests
-timeout 40m bash -c cpu_tests
-
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -1,103 +0,0 @@
-#!/bin/bash
-
-set -xu
-
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
-    && tpu-info \
-    && { \
-        echo TEST_0: Running test_perf.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-        echo TEST_0_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_1: Running test_compilation.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
-        echo TEST_1_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_2: Running test_basic.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
-        echo TEST_2_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        echo TEST_3_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_4: Running test_quantization_accuracy.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
-        echo TEST_4_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_5: Running examples/offline_inference/tpu.py; \
-        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
-        echo TEST_5_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_6: Running test_tpu_model_runner.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
-        echo TEST_6_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_7: Running test_sampler.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
-        echo TEST_7_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_8: Running test_topk_topp_sampler.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
-        echo TEST_8_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_9: Running test_multimodal.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
-        echo TEST_9_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_10: Running test_pallas.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
-        echo TEST_10_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_11: Running test_struct_output_generate.py; \
-        pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
-        echo TEST_11_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_12: Running test_moe_pallas.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
-        echo TEST_12_EXIT_CODE: \$?; \
-    } & \
-    # Disable the TPU LoRA tests until the feature is activated
-    # && { \
-    #     echo TEST_13: Running test_moe_pallas.py; \
-    #     pytest -s -v /workspace/vllm/tests/tpu/lora/; \
-    #     echo TEST_13_EXIT_CODE: \$?; \
-    # } & \
-    wait \
-    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
-"
-
-# TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -8,7 +8,6 @@
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
@ -39,7 +38,7 @@ steps:
  - pip install -r ../../requirements/docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
+  - grep \"sig sig-object py\" build/html/api/inference_params.html

 - label: Async Engine, Inputs, Utils, Worker Test # 24min
  source_file_dependencies:
@ -71,7 +70,6 @@ steps:
 - label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  fast_check: true
-  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
@ -106,8 +104,7 @@ steps:
 - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  torch_nightly: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@ -121,7 +118,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@ -138,14 +135,8 @@ steps:
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
  commands:
-  # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
@ -158,7 +149,6 @@ steps:
  - popd

 - label: Metrics, Tracing Test # 10min
-  mirror_hardwares: [amd]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
@ -166,13 +156,18 @@ steps:
  - tests/tracing
  commands:
  - pytest -v -s metrics
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0,<1.27.0' \
+      'opentelemetry-api>=1.26.0,<1.27.0' \
+      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing

 ##### fast check tests  #####
 #####  1 GPU test  #####

 - label: Regression Test # 5min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@ -203,13 +198,11 @@ steps:
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
-    - pytest -v -s v1/engine
    - pytest -v -s v1/entrypoints
+    - pytest -v -s v1/engine
    - pytest -v -s v1/sample
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
-    - pytest -v -s v1/spec_decode
-    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s v1/test_stats.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
@ -285,25 +278,14 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests
-  torch_nightly: true
-  source_file_dependencies:
-    - vllm/
-    - tests/compile
-  commands:
-    - pytest -v -s compile/test_pass_manager.py
-    - pytest -v -s compile/test_fusion.py
-    - pytest -v -s compile/test_sequence_parallelism.py
-
 - label: PyTorch Fullgraph Smoke Test # 9min
-  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@ -314,61 +296,24 @@ steps:
  - pytest -v -s compile/piecewise/test_toy_llama.py

 - label: PyTorch Fullgraph Test # 18min
-  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Core Operation Test
+- label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
-  - tests/kernels/core
-  commands:
-    - pytest -v -s kernels/core
-
- label: Kernels Attention Test %N
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - csrc/attention/
  - vllm/attention
-  - vllm/v1/attention
-  - tests/kernels/attention
+  - tests/kernels
  commands:
-    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 2
-
- label: Kernels Quantization Test %N
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - csrc/quantization/
-  - vllm/model_executor/layers/quantization
-  - tests/kernels/quantization
-  commands:
-    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 2
-
- label: Kernels MoE Test
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - csrc/moe/
-  - tests/kernels/moe
-  - vllm/model_executor/layers/fused_moe/
-  commands:
-    - pytest -v -s kernels/moe
-
- label: Kernels Mamba Test
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - csrc/mamba/
-  - tests/kernels/mamba
-  commands:
-    - pytest -v -s kernels/mamba
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4

 - label: Tensorizer Test # 11min
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -384,22 +329,14 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - bash scripts/run-benchmarks.sh
+  - bash run-benchmarks.sh

- label: Benchmarks CLI Test # 10min
-  source_file_dependencies:
-  - vllm/
-  - tests/benchmarks/
-  commands:
-  - pytest -v -s benchmarks/
-
- label: Quantization Test
+- label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
-  commands:
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@ -408,7 +345,7 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1

 - label: OpenAI API correctness
  source_file_dependencies:
@ -427,101 +364,106 @@ steps:

 - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
-  #mirror_hardwares: [ amd ]
+  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
-    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
-    - pytest -v -s mistral_tool_use

 #####  models test  #####

 - label: Basic Models Test # 24min
-  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_utils.py
-    - pytest -v -s models/test_vision.py
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
-  - tests/models/language
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-    - pytest -v -s models/language -m core_model
+    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+    - pytest -v -s models/embedding/language -m core_model

- label: Language Models Test (Extended)
+- label: Language Models Test (Extended) # 1h10min
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/language
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-    - pytest -v -s models/language -m 'not core_model'
+    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+    - pytest -v -s models/embedding/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 40min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
-  - tests/models/multimodal
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
+  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing
-    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
-    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+    - pytest -v -s models/multimodal
+    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+    - pytest -v -s models/embedding/vision_language -m core_model
+    - pytest -v -s models/encoder_decoder/audio_language -m core_model
+    - pytest -v -s models/encoder_decoder/language -m core_model
+    - pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/multimodal
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+    # HACK - run phi3v tests separately to sidestep this transformers bug
+    # https://github.com/huggingface/transformers/issues/34307
+    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/embedding/vision_language -m 'not core_model'
+    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 # 38m
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/multimodal
+  - tests/models/decoder_only/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

- label: Multi-Modal Models Test (Extended) 3
-  optional: true
+- label: SSM and Hybrid Models Test # 12min
  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
+      - vllm/
+      - tests/models/decoder_only/language/test_hybrid.py
+      - tests/models/decoder_only/language/test_mamba.py
  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-
- label: Quantized Models Test
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/model_executor/layers/quantization
-  - tests/models/quantization
-  commands:
-    - pytest -v -s models/quantization
+      - pytest -v -s models/decoder_only/language/test_hybrid.py
+      - pytest -v -s models/decoder_only/language/test_mamba.py

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amd]
  optional: true
  commands:
    - echo 'Testing custom models...'
@ -533,7 +475,6 @@ steps:
 #####  multi gpus test  #####

 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -576,27 +517,24 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - vllm/v1/engine/
  commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
-  - pytest -v -s distributed/test_sequence_parallel.py
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
@ -659,10 +597,14 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # This test runs llama 13B, so it is required to run on 4 GPUs.
+    - pytest -v -s -x lora/test_long_context.py
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
+    - pytest -v -s -x lora/test_transfomers_model.py


 - label: Weight Loading Multiple GPU Test  # 33min
@ -713,4 +655,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu121"* ]]; then
+    # if $normal_wheel matches cu121, do not upload the index.html
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu121"* ]]; then
+    # if $normal_wheel matches cu121, do not upload the index.html
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -12,7 +12,6 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
 /vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
-/vllm/vllm_flash_attn @LucasWilkinson
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -14,19 +14,19 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
-      <summary>The output of <code>python collect_env.py</code></summary>
+      <summary>The output of `python collect_env.py`</summary>

      ```text
      Your output of `python collect_env.py` here
      ```
-
+      
      </details>
  validations:
    required: true
@ -75,7 +75,7 @@ body:
      ```

      ```
-      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+      The error message you got, with the full traceback.
      ```
  validations:
    required: true
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
--- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
@ -35,7 +35,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -0,0 +1,28 @@
+name: 🎲 Misc/random discussions that do not fit into the above categories.
+description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
+title: "[Misc]: "
+labels: ["misc"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Anything you want to discuss about vllm.
+    description: >
+      Anything you want to discuss about vllm.
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you have already searched for relevant issues and asked the chatbot in the bottom-right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer many frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,5 +1 @@
 blank_issues_enabled: false
-contact_links:
-  - name: Questions
-    url: https://discuss.vllm.ai
-    about: Ask questions and discuss with other vLLM community members
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)

 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -19,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
-      - files~=^docker/Dockerfile
+      - files~=^Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@ -55,19 +55,11 @@ pull_request_rules:
  description: Automatically apply structured-output label
  conditions:
    - or:
-      - files~=^benchmarks/structured_schemas/
-      - files=benchmarks/benchmark_serving_structured_output.py
-      - files=benchmarks/run_structured_output_benchmark.sh
-      - files=docs/source/features/structured_outputs.md
-      - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
-      - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_guided_generate.py
-      - files~=^vllm/v1/structured_output/
+      - files=benchmarks/benchmark_serving_guided.py
+      - files=benchmarks/benchmark_guided.py
  actions:
    label:
      add:
@ -96,58 +88,6 @@ pull_request_rules:
      add:
        - v1

- name: label-tpu
-  description: Automatically apply tpu label
-  # Keep this list in sync with `label-tpu-remove` conditions
-  conditions:
-    - or:
-      - files~=tpu.py
-      - files~=_tpu
-      - files~=tpu_
-      - files~=/tpu/
-      - files~=pallas
-  actions:
-    label:
-      add:
-        - tpu
-
- name: label-tpu-remove
-  description: Automatically remove tpu label
-  # Keep this list in sync with `label-tpu` conditions
-  conditions:
-    - and:
-      - -files~=tpu.py
-      - -files~=_tpu
-      - -files~=tpu_
-      - -files~=/tpu/
-      - -files~=pallas
-  actions:
-    label:
-      remove:
-        - tpu
-
- name: label-tool-calling
-  description: Automatically add tool-calling label
-  conditions:
-    - or:
-      - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
-      - files~=^tests/entrypoints/openai/tool_parsers/
-      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/
-      - files=docs/source/features/tool_calling.md
-      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
-      - files=docs/source/getting_started/examples/chat_with_tools.md
-      - files~=^examples/tool_chat_*
-      - files=examples/offline_inference/chat_with_tools.py
-      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
-      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
-      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
-  actions:
-    label:
-      add:
-        - tool-calling
-
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -50,7 +50,7 @@ jobs:
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
@ -66,7 +66,7 @@ jobs:
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
-
+    
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@ -79,4 +79,4 @@ jobs:
                          "max_tokens": 7,
                          "temperature": 0
                  }'):$CODE"
-          echo "$CODE"
+          echo "$CODE"
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,7 @@
 /vllm/_version.py

 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/*
+vllm/vllm_flash_attn/

 # Byte-compiled / optimized / DLL files
 __pycache__/
@ -80,7 +80,6 @@ instance/
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/
-docs/source/api/vllm

 # PyBuilder
 .pybuilder/
@ -203,6 +202,3 @@ benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/
-
-# Ingore moe/marlin_moe gen code
-csrc/moe/marlin_moe_wna16/kernel_*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,6 +1,3 @@
-default_install_hook_types:
-  - pre-commit
-  - commit-msg
 default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
@ -11,30 +8,31 @@ repos:
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
+    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.9.3
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+  rev: v2.4.0
  hooks:
  - id: codespell
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
+  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v19.1.7
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.9.29
+  rev: v0.9.27
  hooks:
  - id: pymarkdown
    args: [fix]
@ -43,10 +41,10 @@ repos:
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.6.2
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+      args: [requirements/test.in, -o, requirements/test.txt]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
@ -101,8 +99,8 @@ repos:
    args:
      - -c
      - |
-        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
-          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
        fi
    language: system
    verbose: true
@ -121,10 +119,6 @@ repos:
    language: system
    always_run: true
    pass_filenames: false
-  - id: update-dockerfile-graph
-    name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
-    language: script
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)

 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
+
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

@ -33,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
-# versions are derived from docker/Dockerfile.rocm
+# versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")

 #
 # Try to find python package with an executable that exactly matches
@ -229,28 +230,25 @@ set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
-  "csrc/attention/merge_attn_states.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
-  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
-  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
-  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

-  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
+  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
+  # Please keep this in sync with FetchContent_Declare line below.
+  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -268,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG ${CUTLASS_REVISION}
+        GIT_TAG v3.8.0
        GIT_PROGRESS TRUE

        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -284,13 +282,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/custom_all_reduce.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu")
+    "csrc/cutlass_extensions/common.cpp")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -301,52 +299,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # are not supported by Machete yet.
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
-
-    #
-    # For the Marlin kernels we automatically generate sources for various
-    # preselected input type pairs and schedules.
-    # Generate sources:
-    set(MARLIN_GEN_SCRIPT
-      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
-    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
-
-    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
-
-    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
-      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
-        RESULT_VARIABLE marlin_generation_result
-        OUTPUT_VARIABLE marlin_generation_result
-        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
-        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
-      )
-
-      if (NOT marlin_generation_result EQUAL 0)
-        message(FATAL_ERROR "Marlin generation failed."
-                            " Result: \"${marlin_generation_result}\""
-                            "\nCheck the log for details: "
-                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
-      else()
-        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run Marlin generate script hash" FORCE)
-        message(STATUS "Marlin generation completed successfully.")
-      endif()
-    else()
-      message(STATUS "Marlin generation script has not changed, skipping generation.")
-    endif()
-
-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
-
-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
-
    set(MARLIN_SRCS
+       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@ -418,7 +372,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -508,52 +461,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(FP4_ARCHS)
  endif()

-  # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
-    set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${MLA_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
-    # Add MLA-specific include directories only to MLA source files
-    set_source_files_properties(${SRCS}
-      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
-    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
-  else()
-    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
-    # clear MLA_ARCHS
-    set(MLA_ARCHS)
-  endif()
-
-  # CUTLASS MoE kernels
-
-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
-  # to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
-             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
-    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
-                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
-                     "if you intend on running FP8 quantized MoE models on Hopper.")
-    else()
-      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()
-
  #
  # Machete kernels

@ -673,51 +580,21 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
+    set(MARLIN_MOE_SRC
+        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
+        "csrc/moe/marlin_moe_ops.cu")

-    #
-    # For the Marlin MOE kernels we automatically generate sources for various
-    # preselected input type pairs and schedules.
-    # Generate sources:
-    set(MOE_MARLIN_GEN_SCRIPT
-      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
-    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
-
-    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
-
-    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
-      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
-        RESULT_VARIABLE moe_marlin_generation_result
-        OUTPUT_VARIABLE moe_marlin_generation_output
-        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
-        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
-      )
-
-      if (NOT moe_marlin_generation_result EQUAL 0)
-        message(FATAL_ERROR "Marlin MOE generation failed."
-                            " Result: \"${moe_marlin_generation_result}\""
-                            "\nCheck the log for details: "
-                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
-      else()
-        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
-        message(STATUS "Marlin MOE generation completed successfully.")
-      endif()
-    else()
-      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
-    endif()
-
-    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
    set_gencode_flags_for_srcs(
-      SRCS "${MOE_WNAA16_MARLIN_SRC}"
+      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

-    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
-
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@ -725,17 +602,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
 endif()

-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(MOE_PERMUTE_SRC
-      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
-      "csrc/moe/moe_permute_unpermute_op.cu")
-
-  set_gencode_flags_for_srcs(
-    SRCS "${MARLIN_PERMUTE_SRC}"
-    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
-
-  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
-endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
@ -744,8 +610,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

@ -755,7 +619,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
-    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")

  define_gpu_extension_target(
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -5,39 +5,30 @@
 # docs/source/contributing/dockerfile/dockerfile.md and
 # docs/source/assets/contributing/dockerfile-stages-dependency.png

-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive

-# Install Python and other dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
+# Install minimal dependencies and uv
+RUN apt-get update -y \
+    && apt-get install -y ccache git curl wget sudo \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Add uv to PATH
+ENV PATH="/root/.local/bin:$PATH"
+# Create venv with specified Python and activate by placing at the front of path
+ENV VIRTUAL_ENV="/opt/venv"
+RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
@ -55,23 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

 WORKDIR /workspace

-# install build and runtime dependencies
-
 # arm64 (GH200) build follows the practice of "use existing pytorch" build,
 # we need to install torch and torchvision from the nightly builds first,
 # pytorch will not appear as a vLLM dependency in all of the following steps
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121";  \
    fi

 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install -r requirements/cuda.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@ -94,11 +81,9 @@ COPY requirements/build.txt requirements/build.txt
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install -r requirements/build.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
@ -165,25 +150,19 @@ FROM base as dev
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install -r requirements/dev.txt
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
@ -192,31 +171,22 @@ ARG TARGETPLATFORM
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install Python and other dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
+# Install minimal dependencies and uv
+RUN apt-get update -y \
+    && apt-get install -y ccache git curl wget sudo vim \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Add uv to PATH
+ENV PATH="/root/.local/bin:$PATH"
+# Create venv with specified Python and activate by placing at the front of path
+ENV VIRTUAL_ENV="/opt/venv"
+RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -230,15 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
    fi

 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install dist/*.whl --verbose

 # If we need to build FlashInfer wheel before its release:
 # $ export FLASHINFER_ENABLE_AOT=1
@ -253,19 +221,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl

 RUN --mount=type=cache,target=/root/.cache/uv \
-. /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    # TESTING: install FlashInfer from source to test 2.7.0 final RC
-    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
-    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 fi
 COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-. /etc/environment && \
-uv pip list

 # Although we build Flashinfer with AOT mode, there's still
 # some issues w.r.t. JIT compilation. Therefore we need to
@ -273,8 +232,7 @@ uv pip list
 # TODO: Remove this once FlashInfer AOT wheel is fixed
 COPY requirements/build.txt requirements/build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install -r requirements/build.txt

 #################### vLLM installation IMAGE ####################

@ -288,23 +246,18 @@ ADD . /vllm-workspace/
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt
+    uv pip install -r requirements/dev.txt

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -e tests/vllm_test_utils
+    uv pip install -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
+    uv pip install hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

 # Copy in the v1 package for testing (it isn't distributed yet)
@ -321,7 +274,6 @@ RUN mv vllm test_docs/
 #################### OPENAI API SERVER ####################
 # base openai image with additional requirements, for any subsequent openai-style images
 FROM vllm-base AS vllm-openai-base
-ARG TARGETPLATFORM

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -330,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

 ENV VLLM_USAGE_SOURCE production-docker-image
--- a/docker/Dockerfile.arm
+++ b/docker/Dockerfile.arm
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -0,0 +1,69 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-openmp==2025.0.1
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+RUN pip install intel_extension_for_pytorch==2.6.0
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    pip install --upgrade pip && \
+    pip install -r requirements/build.txt
+
+FROM cpu-test-1 AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
+    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
+    pip install -v -r requirements/cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Support for building vLLM without AVX512: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -e tests/vllm_test_utils
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

 COPY ./ /workspace/vllm

--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

 FROM $BASE_IMAGE

@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm

 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest

 # uninstall transformers-neuronx package explicitly to avoid version conflict
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -0,0 +1,29 @@
+# This vLLM Dockerfile is used to construct a vLLM image that can be directly used
+# to run the OpenAI-compatible server.
+
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y \
+        git python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
+WORKDIR /workspace
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN python3 -m pip install -U pip
+# install build requirements
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
+# build vLLM with OpenVINO backend
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
+
+COPY examples/ /workspace/examples
+COPY benchmarks/ /workspace/benchmarks
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -0,0 +1,37 @@
+FROM mambaorg/micromamba
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+USER root
+
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
+
+# Some packages from requirements/cpu.txt are installed here
+# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
+# Currently these may not be available for venv or pip directly
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
+RUN --mount=type=cache,target=/root/.cache/pip  \
+    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        -r requirements/cpu.txt \
+        xformers uvloop==0.20.0
+
+RUN --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@ -12,8 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
-    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
-    apt-transport-https ca-certificates wget curl
+    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
 # Remove sccache    
 RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
@ -41,7 +40,7 @@ ARG USE_CYTHON
 RUN cd vllm \
    && python3 -m pip install -r requirements/rocm.txt \
    && python3 setup.py clean --all  \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
+    && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
    && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
@ -114,16 +113,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

-# ENV that can improve safe tensor loading, and end-to-end time
-ENV SAFETENSORS_FAST_GPU=1
-
-# User-friendly environment setting for multi-processing to avoid below RuntimeError.
-# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
-# you must use the 'spawn' start method 
-# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1

 CMD ["/bin/bash"]
+
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,26 +1,24 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
+ARG HIPBLASLT_BRANCH="4d40e36"
 ARG HIPBLAS_COMMON_BRANCH="7c1566b"
 ARG LEGACY_HIPBLASLT_OPTION=
 ARG RCCL_BRANCH="648a58d"
 ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
-ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ARG PYTORCH_BRANCH="3a585126"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
 ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="1a7f4dfa"
-ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="5a77249"
-ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"

 FROM ${BASE_IMAGE} AS base

 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

 ARG PYTHON_VERSION=3.12
@ -31,11 +29,8 @@ ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && apt-get install -y software-properties-common git curl sudo vim less \
+    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
@ -45,7 +40,7 @@ RUN apt-get update -y \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@ -63,8 +58,7 @@ RUN cd hipBLAS-common \
 RUN git clone https://github.com/ROCm/hipBLASLt
 RUN cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
-    && apt-get install -y llvm-dev \
-    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -114,24 +108,11 @@ RUN git clone ${FA_REPO}
 RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install

-FROM base AS build_aiter
-ARG AITER_BRANCH
-ARG AITER_REPO
-RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
-    pip install /install/*.whl
-RUN git clone --recursive ${AITER_REPO}
-RUN cd aiter \
-    && git checkout ${AITER_BRANCH} \
-    && git submodule update --init --recursive \
-    && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
-RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
-
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
@ -147,11 +128,8 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
-RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
-    pip install /install/*.whl

 ARG BASE_IMAGE
-ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
@ -164,8 +142,6 @@ ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG FA_BRANCH
 ARG FA_REPO
-ARG AITER_BRANCH
-ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -179,5 +155,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
 RUN microdnf install -y \
    which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
    libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
-    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
+    openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
    microdnf clean all

 # Python Installation
@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    cd ../../python && \
    export PYARROW_PARALLEL=4 && \
    export ARROW_BUILD_TYPE=release && \
-    uv pip install -r requirements-build.txt && \
+    uv pip install -r requirements/build.txt && \
    python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel

 FROM python-install AS numa-build
@ -96,22 +96,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
    python setup.py bdist_wheel

-FROM python-install AS hf-xet-builder
-# Install hf-xet
-WORKDIR /tmp
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
-    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    git clone https://github.com/huggingface/xet-core.git && \
-    cd xet-core/hf_xet/ && \
-    uv pip install maturin patchelf && \
-    python -m maturin build --release --out dist && \
-    mkdir -p /tmp/hf-xet/dist && \
-    cp dist/*.whl /tmp/hf-xet/dist/
-
 # Final build stage
 FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION
@ -123,7 +107,6 @@ ENV UV_LINK_MODE=copy
 ENV CARGO_HOME=/root/.cargo
 ENV RUSTUP_HOME=/root/.rustup
 ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1

 COPY . /workspace/vllm
 WORKDIR /workspace/vllm
@ -137,15 +120,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
    --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
    --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
-    --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
     sed -i '/^torch/d' requirements/build.txt && \
     ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
     VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
-     HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
    uv pip install -v \    
        $ARROW_WHL_FILE  \
        $VISION_WHL_FILE \
-        $HF_XET_WHL_FILE \
        --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
        --index-strategy unsafe-best-match \
        -r requirements/build.txt \
@ -169,5 +149,4 @@ USER 2000
 WORKDIR /home/vllm

 # Set the default entrypoint
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
-
+ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 -m pip install \
        -r requirements/tpu.txt
-RUN python3 -m pip install -e .
+RUN python3 setup.py develop

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@ -1,7 +1,11 @@
-# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base

-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg

 RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
@ -17,6 +21,8 @@ RUN apt-get update -y && \
    python3 \
    python3-dev \
    python3-pip \
+    libze-intel-gpu-dev \
+    libze-intel-gpu1 \
    wget

 WORKDIR /workspace/vllm
@ -40,6 +46,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 setup.py install

+# Per the XPU docs, intel-extension-for-pytorch 2.6.10+xpu must be installed manually because its dependencies conflict with torch 2.6.0+xpu.
+# FIXME: This will be fixed in ipex 2.7; the note is left here for awareness.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-extension-for-pytorch==2.6.10+xpu \
+    --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
 CMD ["/bin/bash"]

 FROM vllm-base AS vllm-openai
--- a/README.md
+++ b/README.md
@ -10,24 +10,15 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

---
-
 *Latest News* 🔥
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

-<details>
-<summary>Previous News</summary>
-
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
+- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
@ -43,9 +34,8 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

-</details>
-
 ---
+
 ## About

 vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -100,7 +90,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

 ## Sponsors

@ -123,7 +113,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
@ -154,11 +143,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+- For technical questions and feature requests, please use GitHub issues or discussions.
+- For discussing with fellow users and coordinating contributions and development, please use Slack.
+- For security disclosures, please use GitHub's security advisory feature.
+- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

 ## Media Kit

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -41,39 +41,29 @@ become available.
      <td><code>synthetic</code></td>
    </tr>
    <tr>
-      <td><strong>HuggingFace-VisionArena</strong></td>
+      <td><strong>HuggingFace</strong></td>
      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmarena-ai/VisionArena-Chat</code></td>
+      <td style="text-align: center;">🟡</td>
+      <td>Specify your dataset path on HuggingFace</td>
    </tr>
    <tr>
-      <td><strong>HuggingFace-InstructCoder</strong></td>
+      <td><strong>VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
-      <td><code>likaixin/InstructCoder</code></td>
-    </tr>
-      <tr>
-      <td><strong>HuggingFace-AIMO</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-Other</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
+      <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
    </tr>
  </tbody>
 </table>

 ✅: supported

-🟡: Partial support
-
 🚧: to be supported

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
+formats, please consider contributing.
+
+**Note**: For VisionArena, `dataset-name` should be set to `hf`.

 ---
 ## Example - Online Benchmark
@ -81,7 +71,8 @@ become available.
 First start serving your model

 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+vllm serve ${MODEL_NAME} --disable-log-requests
 ```

 Then run the benchmarking script
@ -89,13 +80,12 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
 ```

 If successful, you will see the following output
@ -132,105 +122,37 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

 ```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT='train'
+
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --hf-split train \
-  --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-model "[ngram]" \
-    --ngram_prompt_lookup_min 2 \
-    --ngram-prompt-lookup-max 5 \
-    --num_speculative_tokens 5
-```
-
-``` bash
-python3 benchmarks/benchmark_serving.py \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name hf \
-    --dataset-path likaixin/InstructCoder \
-    --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
-```
-
-**`lmms-lab/LLaVA-OneVision-Data`**
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-**`AI-MO/aimo-validation-aime`**
-
-``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path AI-MO/aimo-validation-aime \
-    --num-prompts 10 \
-    --seed 42
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --top-k 10 \
-  --top-p 0.9 \
-  --temperature 0.5 \
-  --num-prompts 10
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}"
 ```

 ---
 ## Example - Offline Throughput Benchmark

 ```bash
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+DATASET_NAME="sonnet"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"
+
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset-name sonnet \
-  --dataset-path vllm/benchmarks/sonnet.txt \
-  --num-prompts 10
+  --model "${MODEL_NAME}" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}"
 ```

 If successful, you will see the following output
@ -244,13 +166,19 @@ Total num output tokens:  1500
 ### VisionArena Benchmark for Vision Language Models

 ``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"
+
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --num-prompts 1000 \
-  --hf-split train
+  --model "${MODEL_NAME}" \
+  --backend "vllm-chat" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-split "${DATASET_SPLIT}"
 ```

 The `num prompt tokens` now includes image token counts
@ -261,83 +189,29 @@ Total num prompt tokens:  14527
 Total num output tokens:  1280
 ```

-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
-    --dataset-name=hf \
-    --dataset-path=likaixin/InstructCoder \
-    --model=meta-llama/Meta-Llama-3-8B-Instruct \
-    --input-len=1000 \
-    --output-len=100 \
-    --num-prompts=2048 \
-    --async-engine \
-    --speculative-model="[ngram]" \
-    --ngram_prompt_lookup_min=2 \
-    --ngram-prompt-lookup-max=5 \
-    --num_speculative_tokens=5
-```
-
-```
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens:  261136
-Total num output tokens:  204800
-```
-
-### Other HuggingFaceDataset Examples
-
-**`lmms-lab/LLaVA-OneVision-Data`**
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-**`AI-MO/aimo-validation-aime`**
-
-```bash
-python3 benchmarks/benchmark_throughput.py \
-  --model Qwen/QwQ-32B \
-  --backend vllm \
-  --dataset-name hf \
-  --dataset-path AI-MO/aimo-validation-aime \
-  --hf-split train \
-  --num-prompts 10
-```
-
 ### Benchmark with LoRA Adapters

 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+MODEL_NAME="meta-llama/Llama-2-7b-hf"
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+NUM_PROMPTS=10
+MAX_LORAS=2
+MAX_LORA_RANK=8
+ENABLE_LORA="--enable-lora"
+LORA_PATH="yard1/llama-2-7b-sql-lora-test"
+
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model meta-llama/Llama-2-7b-hf \
-  --backend vllm \
-  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --dataset_name sharegpt \
-  --num-prompts 10 \
-  --max-loras 2 \
-  --max-lora-rank 8 \
-  --enable-lora \
-  --lora-path yard1/llama-2-7b-sql-lora-test
+  --model "${MODEL_NAME}" \
+  --backend "${BACKEND}" \
+  --dataset_path "${DATASET_PATH}" \
+  --dataset_name "${DATASET_NAME}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --max-loras "${MAX_LORAS}" \
+  --max-lora-rank "${MAX_LORA_RANK}" \
+  ${ENABLE_LORA} \
+  --lora-path "${LORA_PATH}"
  ```
--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
@ -1,212 +0,0 @@
-#!/bin/bash
-
-# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. 
-# The current server parameter combination is  max_num_seqs and max_num_batched_tokens
-# It also supports additional requirement: e2e latency and prefix cache. 
-
-# Pre-requisite:
-# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version. 
-# 2. If the model is customized, replace the MODEL's config with the customized config.
-# 3. Set variables (ALL REQUIRED)
-#   BASE: your directory for vllm repo
-#   MODEL: the model served by vllm
-#   DOWNLOAD_DIR: directory to download and load model weights.
-#   INPUT_LEN: request input len
-#   OUTPUT_LEN: request output len
-#   MIN_CACHE_HIT_PCT: prefix cache rate
-#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
-# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
-# 5. The final result will be saved in RESULT file. 
-
-
-# Example use cases 
-# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
-# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
-# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
-# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
-# 3. If we want to reach 60% prefix cache, what's the best server parameter? 
-# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
-
-TAG=$(date +"%Y_%m_%d_%H_%M")
-BASE=""
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-
-LOG_FOLDER="$BASE/auto-benchmark/$TAG"
-RESULT="$LOG_FOLDER/result.txt"
-
-echo "result file$ $RESULT"
-echo "model: $MODEL"
-echo
-
-rm -rf $LOG_FOLDER
-mkdir -p $LOG_FOLDER
-
-cd "$BASE/vllm"
-# create sonnet-4x.txt so that we can sample 2048 tokens for input
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done
-
-pip install datasets
-
-current_hash=$(git rev-parse HEAD)
-echo "hash:$current_hash" >> "$RESULT"
-echo "current_hash: $current_hash"
-
-best_throughput=0
-best_max_num_seqs=0
-best_num_batched_tokens=0
-best_goodput=0
-run_benchmark() {
-    local max_num_seqs=$1
-    local max_num_batched_tokens=$2
-    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
-    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-    echo "vllm_log: $vllm_log"
-    echo
-    rm -f $vllm_log
-
-    # start the server
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
-        --disable-log-requests \
-        --port 8004 \
-        --gpu-memory-utilization 0.98 \
-        --max-num-seqs $max_num_seqs \
-        --max-num-batched-tokens $max_num_batched_tokens \
-        --tensor-parallel-size 1 \
-        --enable-prefix-caching \
-        --load-format dummy \
-        --download-dir $DOWNLOAD_DIR \
-        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-    echo "wait for 10 minutes.."
-    echo
-    # wait for 10 minutes...
-    server_started=0
-    for i in {1..60}; do        
-        if grep -Fq "Application startup complete" "$vllm_log"; then
-            echo "Application started"
-            server_started=1
-            break
-        else
-            # echo "wait for 10 seconds..."
-            sleep 10
-        fi
-    done
- 
-    if (( ! server_started )); then
-        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
-        echo "pkill -f vllm"
-        echo
-        pkill vllm
-        sleep 10
-        return 1
-    fi
-    
-    echo "run benchmark test..."
-    echo
-    meet_latency_requirement=0
-    # get a basic qps by using request-rate inf
-    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
-    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
-    python benchmarks/benchmark_serving.py \
-        --backend vllm \
-        --model $MODEL  \
-        --dataset-name sonnet \
-        --dataset-path benchmarks/sonnet_4x.txt \
-        --sonnet-input-len $INPUT_LEN \
-        --sonnet-output-len $OUTPUT_LEN \
-        --ignore-eos \
-        --disable-tqdm \
-        --request-rate inf \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-        --num-prompts 100 \
-        --sonnet-prefix-len $prefix_len \
-        --port 8004 > "$bm_log"
-    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
-    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-
-    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
-        meet_latency_requirement=1
-    fi
-
-    if (( ! meet_latency_requirement )); then
-    # start from request-rate as int(through_put) + 1
-        request_rate=$((${through_put%.*} + 1))
-        while ((request_rate > 0)); do
-            # clear prefix cache
-            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
-            sleep 5
-            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-            python benchmarks/benchmark_serving.py \
-                --backend vllm \
-                --model $MODEL  \
-                --dataset-name sonnet \
-                --dataset-path benchmarks/sonnet_4x.txt \
-                --sonnet-input-len $INPUT_LEN \
-                --sonnet-output-len $OUTPUT_LEN \
-                --ignore_eos \
-                --disable-tqdm \
-                --request-rate $request_rate \
-                --percentile-metrics ttft,tpot,itl,e2el \
-                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
-                --num-prompts 100 \
-                --sonnet-prefix-len $prefix_len \
-                --port 8004 > "$bm_log"
-            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
-            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
-                meet_latency_requirement=1
-                break
-            fi
-            request_rate=$((request_rate-1))
-        done
-    fi
-    # write the results and update the best result.
-    if ((meet_latency_requirement)); then
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
-        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
-            best_throughput=$through_put
-            best_max_num_seqs=$max_num_seqs
-            best_num_batched_tokens=$max_num_batched_tokens
-            best_goodput=$goodput
-        fi
-    else
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
-        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
-    fi
-
-    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-
-    echo "pkill -f vllm"
-    echo
-    pkill vllm
-    sleep 10
-    rm -f $vllm_log
-    printf '=%.0s' $(seq 1 20)
-    return 0
-}
-
-
-num_seqs_list="128 256"
-num_batched_tokens_list="512 1024 2048 4096"
-for num_seqs in $num_seqs_list; do
-    for num_batched_tokens in $num_batched_tokens_list; do
-        run_benchmark $num_seqs $num_batched_tokens
-        exit 0
-    done
-done
-echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
-
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0

-import io
 import json
 import os
 import sys
@ -33,7 +32,6 @@ class RequestFuncInput:
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
-    language: Optional[str] = None


@dataclass
@ -65,7 +63,7 @@ async def async_request_tgi(
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
-            "ignore_eos_token": request_func_input.ignore_eos,
+            # TGI does not accept ignore_eos flag.
        }
        payload = {
            "inputs": request_func_input.prompt,
@ -73,10 +71,6 @@ async def async_request_tgi(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
-        if request_func_input.ignore_eos:
-            output.output_tokens = request_func_input.output_len
-        else:
-            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
@ -201,7 +195,6 @@ async def async_request_deepspeed_mii(
                                     timeout=AIOHTTP_TIMEOUT) as session:

        payload = {
-            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
@ -222,15 +215,7 @@ async def async_request_deepspeed_mii(
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    if "choices" in parsed_resp:
-                        output.generated_text = parsed_resp["choices"][0][
-                            "text"]
-                    elif "text" in parsed_resp:
-                        output.generated_text = parsed_resp["text"][0]
-                    else:
-                        output.error = ("Unexpected response format: "
-                                        "neither 'choices' nor 'text' found")
-                        output.success = False
+                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
@ -261,7 +246,6 @@ async def async_request_openai_completions(
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
-            "repetition_penalty": 1.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
@ -440,110 +424,6 @@ async def async_request_openai_chat_completions(
    return output


-async def async_request_openai_audio(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    # Lazy import without PlaceholderModule to avoid vllm dep.
-    import soundfile
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        ("transcriptions", "translations"
-         )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
-    "or `translations`."
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        payload = {
-            "model": request_func_input.model_name \
-                if request_func_input.model_name else request_func_input.model,
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "language": "en",
-            # Flattened due to multipart/form-data
-            "stream_include_usage": True,
-            "stream_continuous_usage_stats": True
-        }
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        # Send audio file
-        def to_bytes(y, sr):
-            buffer = io.BytesIO()
-            soundfile.write(buffer, y, sr, format="WAV")
-            buffer.seek(0)
-            return buffer
-
-        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
-            form = aiohttp.FormData()
-            form.add_field('file', f, content_type='audio/wav')
-            for key, value in payload.items():
-                form.add_field(key, str(value))
-
-            output = RequestFuncOutput()
-            output.prompt_len = request_func_input.prompt_len
-
-            generated_text = ""
-            ttft = 0.0
-            st = time.perf_counter()
-            most_recent_timestamp = st
-            try:
-                async with session.post(url=api_url,
-                                        data=form,
-                                        headers=headers) as response:
-                    if response.status == 200:
-                        async for chunk_bytes in response.content:
-                            chunk_bytes = chunk_bytes.strip()
-                            if not chunk_bytes:
-                                continue
-
-                            chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                "data: ")
-                            if chunk != "[DONE]":
-                                timestamp = time.perf_counter()
-                                data = json.loads(chunk)
-
-                                if choices := data.get("choices"):
-                                    content = choices[0]["delta"].get(
-                                        "content")
-                                    # First token
-                                    if ttft == 0.0:
-                                        ttft = timestamp - st
-                                        output.ttft = ttft
-
-                                    # Decoding phase
-                                    else:
-                                        output.itl.append(
-                                            timestamp - most_recent_timestamp)
-
-                                    generated_text += content or ""
-                                elif usage := data.get("usage"):
-                                    output.output_tokens = usage.get(
-                                        "completion_tokens")
-
-                                most_recent_timestamp = timestamp
-
-                        output.generated_text = generated_text
-                        output.success = True
-                        output.latency = most_recent_timestamp - st
-                    else:
-                        output.error = response.reason or ""
-                        output.success = False
-            except Exception:
-                output.success = False
-                exc_info = sys.exc_info()
-                output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download
@ -601,14 +481,7 @@ ASYNC_REQUEST_FUNCS = {
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
-    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions,
-             async_request_openai_chat_completions)
-]
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -17,14 +17,12 @@ SampleRequest instances, similar to the approach used in ShareGPT.
 import base64
 import io
 import json
-import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass
 from functools import cache
-from io import BytesIO
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import pandas as pd
@ -37,8 +35,6 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer

-logger = logging.getLogger(__name__)
-
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@ -64,7 +60,9 @@ class SampleRequest:

 class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0
-    IS_MULTIMODAL = False
+
+    # num_requests has default 1000 in both the benchmark_serving.py and
+    # benchmark_throughput.py

    def __init__(
        self,
@ -92,8 +90,8 @@ class BenchmarkDataset(ABC):
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
-        This method is used for chat models that expect a specific conversation
-        format.
+        This method is used for chat models that expect a specific 
+        conversation format.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
@ -103,10 +101,10 @@ class BenchmarkDataset(ABC):
    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.
-
+        
        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.
-
+        
        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
@ -123,18 +121,18 @@ class BenchmarkDataset(ABC):
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.
-
+        
        This method is used when LoRA parameters are provided.  It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.
-
+        
        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
            LoRA is selected.  max_loras (Optional[int]): The maximum number of
            LoRAs available. If None, LoRA is not used.  lora_path
            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
            is not used.
-
+        
        Returns:
            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
            element is a LoRARequest (or None if not applicable) and the second
@ -162,39 +160,21 @@ class BenchmarkDataset(ABC):
               num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
-
+        
        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.
-
+        
        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
             for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
-
+        
        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

-    def maybe_oversample_requests(self, requests: list[SampleRequest],
-                                  num_requests: int) -> None:
-        """
-        Oversamples the list of requests if its size is less than the desired
-        number.
-
-        Args:
-            requests (List[SampleRequest]): The current list of sampled
-            requests.  num_requests (int): The target number of requests.
-        """
-        if len(requests) < num_requests:
-            random.seed(self.random_seed)
-            additional = random.choices(requests,
-                                        k=num_requests - len(requests))
-            requests.extend(additional)
-            logger.info("Oversampled requests to reach %d total samples.",
-                        num_requests)
-

 # -----------------------------------------------------------------------------
 # Utility Functions and Global Caches
@ -241,24 +221,21 @@ def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

-    Supports three input types:
+    For a PIL.Image.Image input:
+      - Converts the image to RGB.
+      - Saves the image as a JPEG in-memory.
+      - Encodes the JPEG data as a base64 string.
+      - Returns a dictionary with the image as a base64 data URL.

-    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
-       containing raw image data.  - Loads the bytes as a PIL.Image.Image.
-
-    2. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
-       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
-       a dictionary with the image as a base64 data URL.
-
-    3. String input: - Treats the string as a URL or local file path.  -
-       Prepends "file://" if the string doesn't start with "http://" or
-       "file://".  - Returns a dictionary with the image URL.
+    For a string input:
+      - Treats the string as a URL or file path.
+      - Prepends "file://" if the string doesn't start with "http://" or
+        "file://".
+      - Returns a dictionary with the image URL.

    Raises:
-        ValueError: If the input is not a supported type.
+      ValueError: If the input is neither a PIL.Image.Image nor a string.
    """
-    if isinstance(image, dict) and 'bytes' in image:
-        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
@ -277,8 +254,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

-    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
-                     " or str or dictionary with raw image bytes.")
+    raise ValueError(
+        f"Invalid image input {image}. Must be a PIL.Image.Image or str.")


 # -----------------------------------------------------------------------------
@ -289,7 +266,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
 class RandomDataset(BenchmarkDataset):
    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
-    DEFAULT_RANGE_RATIO = 0.0
+    DEFAULT_RANGE_RATIO = 1.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

@ -299,44 +276,28 @@ class RandomDataset(BenchmarkDataset):
    ) -> None:
        super().__init__(**kwargs)

-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        prefix_len: int = DEFAULT_PREFIX_LEN,
-        range_ratio: float = DEFAULT_RANGE_RATIO,
-        input_len: int = DEFAULT_INPUT_LEN,
-        output_len: int = DEFAULT_OUTPUT_LEN,
-        **kwargs,
-    ) -> list[SampleRequest]:
-        # Enforce range_ratio < 1
-        assert range_ratio < 1.0, (
-            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
-        )
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               prefix_len: int = DEFAULT_PREFIX_LEN,
+               range_ratio: float = DEFAULT_RANGE_RATIO,
+               input_len: int = DEFAULT_INPUT_LEN,
+               output_len: int = DEFAULT_OUTPUT_LEN,
+               **kwargs) -> list[SampleRequest]:

        vocab_size = tokenizer.vocab_size
-        num_special_tokens = tokenizer.num_special_tokens_to_add()
-        real_input_len = input_len - num_special_tokens

        prefix_token_ids = (np.random.randint(
            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])

-        # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(real_input_len * (1 - range_ratio))
-        input_high = int(real_input_len * (1 + range_ratio))
-        output_low = int(output_len * (1 - range_ratio))
-        output_high = int(output_len * (1 + range_ratio))
-
-        # Add logging for debugging
-        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-        logger.info("Sampling output_len from [%s, %s]", output_low,
-                    output_high)
+        input_low = int(input_len * range_ratio)
+        output_low = int(output_len * range_ratio)

        input_lens = np.random.randint(input_low,
-                                       input_high + 1,
+                                       input_len + 1,
                                       size=num_requests)
        output_lens = np.random.randint(output_low,
-                                        output_high + 1,
+                                        output_len + 1,
                                        size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)

@ -346,17 +307,6 @@ class RandomDataset(BenchmarkDataset):
                         vocab_size).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
-            # After decoding the prompt we have to encode and decode it again.
-            # This is done because in some cases N consecutive tokens
-            # give a string tokenized into != N number of tokens.
-            # For example for GPT2Tokenizer:
-            # [6880, 6881] -> ['Ġcalls', 'here'] ->
-            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
-            # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decode again.
-            re_encoded_sequence = tokenizer.encode(
-                prompt, add_special_tokens=False)[:input_lens[i]]
-            prompt = tokenizer.decode(re_encoded_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
@ -396,24 +346,20 @@ class ShareGPTDataset(BenchmarkDataset):
        random.seed(self.random_seed)
        random.shuffle(self.data)

-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        lora_path: Optional[str] = None,
-        max_loras: Optional[int] = None,
-        output_len: Optional[int] = None,
-        enable_multimodal_chat: bool = False,
-        **kwargs,
-    ) -> list:
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               lora_path: Optional[str] = None,
+               max_loras: Optional[int] = None,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
+               **kwargs) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
-            prompt, completion = (
-                entry["conversations"][0]["value"],
-                entry["conversations"][1]["value"],
-            )
+            prompt, completion = entry["conversations"][0]["value"],\
+                entry["conversations"][1]["value"]

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
@ -437,7 +383,6 @@ class ShareGPTDataset(BenchmarkDataset):
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
-        self.maybe_oversample_requests(samples, num_requests)
        return samples


@ -470,20 +415,19 @@ class SonnetDataset(BenchmarkDataset):
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

-    def sample(
-        self,
-        tokenizer,
-        num_requests: int,
-        prefix_len: int = DEFAULT_PREFIX_LEN,
-        input_len: int = DEFAULT_INPUT_LEN,
-        output_len: int = DEFAULT_OUTPUT_LEN,
-        return_prompt_formatted: bool = False,
-        **kwargs,
-    ) -> list:
+    def sample(self,
+               tokenizer,
+               num_requests: int,
+               prefix_len: int = DEFAULT_PREFIX_LEN,
+               input_len: int = DEFAULT_INPUT_LEN,
+               output_len: int = DEFAULT_OUTPUT_LEN,
+               return_prompt_formatted: bool = False,
+               **kwargs) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
-                      for tokens in tokenized_lines) / len(tokenized_lines)
+                      for tokens in \
+                        tokenized_lines) / len(tokenized_lines)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
@ -499,11 +443,11 @@ class SonnetDataset(BenchmarkDataset):

        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
-        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
+        num_prefix_lines = round((prefix_len - base_offset) / avg_len)
        prefix_lines = self.data[:num_prefix_lines]

        samples = []
-        while len(samples) < num_requests:
+        for _ in range(num_requests):
            extra_lines = random.choices(self.data,
                                         k=num_input_lines - num_prefix_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
@ -511,14 +455,13 @@ class SonnetDataset(BenchmarkDataset):
            prompt_formatted = tokenizer.apply_chat_template(
                msg, add_generation_prompt=True, tokenize=False)
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
-            if prompt_len <= input_len:
-                samples.append(
-                    SampleRequest(
-                        prompt=prompt_formatted
-                        if return_prompt_formatted else prompt,
-                        prompt_len=prompt_len,
-                        expected_output_len=output_len,
-                    ))
+            samples.append(
+                SampleRequest(
+                    prompt=prompt_formatted
+                    if return_prompt_formatted else prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
        return samples


@ -563,14 +506,12 @@ class BurstGPTDataset(BenchmarkDataset):
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        max_loras: Optional[int] = None,
-        lora_path: Optional[str] = None,
-        **kwargs,
-    ) -> list[SampleRequest]:
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               max_loras: Optional[int] = None,
+               lora_path: Optional[str] = None,
+               **kwargs) -> list[SampleRequest]:
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
@ -594,48 +535,49 @@ class BurstGPTDataset(BenchmarkDataset):


 # -----------------------------------------------------------------------------
-# HuggingFace Dataset Base Implementation
+# HuggingFace Dataset Implementation
 # -----------------------------------------------------------------------------
-class HuggingFaceDataset(BenchmarkDataset):
-    """Base class for datasets hosted on HuggingFace."""

-    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+
+class HuggingFaceDataset(BenchmarkDataset):
+    """
+    Dataset class for processing a HuggingFace dataset with conversation data
+    and optional images.
+    """
+    DEFAULT_NUM_REQUESTS = 1000

    def __init__(
        self,
-        dataset_path: str,
        dataset_split: str,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
-        super().__init__(dataset_path=dataset_path, **kwargs)
-
+        super().__init__(**kwargs)
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
+
        self.load_data()

    def load_data(self) -> None:
-        """Load data from HuggingFace datasets."""
+        if not self.dataset_path:
+            raise ValueError("dataset_path must be provided for loading data.")
+
        self.data = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
-        self.data = self.data.shuffle(seed=self.random_seed)
-
-
-# -----------------------------------------------------------------------------
-# Conversation Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ConversationDataset(HuggingFaceDataset):
-    """Dataset for conversation data with multimodal support."""
-    SUPPORTED_DATASET_PATHS = {
-        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
-    }
-    IS_MULTIMODAL = True
+        if self.data.features is None or "conversations" \
+            not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
+        # Shuffle and filter examples with at least 2 conversations.
+        self.data = self.data.shuffle(seed=self.random_seed).filter(
+            lambda x: len(x["conversations"]) >= 2)

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
@ -643,13 +585,10 @@ class ConversationDataset(HuggingFaceDataset):
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
-        # Filter examples with at least 2 conversations
-        filtered_data = self.data.filter(
-            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None

-        for item in filtered_data:
+        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
@ -679,7 +618,6 @@ class ConversationDataset(HuggingFaceDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -694,33 +632,44 @@ class VisionArenaDataset(HuggingFaceDataset):
    """

    DEFAULT_OUTPUT_LEN = 128
-    SUPPORTED_DATASET_PATHS = {
-        "lmarena-ai/VisionArena-Chat":
-        lambda x: x["conversation"][0][0]["content"],
-        "lmarena-ai/vision-arena-bench-v0.1":
-        lambda x: x["turns"][0][0]["content"]
-    }
-    IS_MULTIMODAL = True
+    DEFAULT_NUM_REQUESTS = 1000
+    VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"

-    def sample(
+    def __init__(
        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        output_len: Optional[int] = None,
-        enable_multimodal_chat: bool = False,
        **kwargs,
-    ) -> list:
+    ) -> None:
+        super().__init__(**kwargs)
+        if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
+            raise ValueError(f"Only support Vision Arena dataset.\
+                    This data path {self.dataset_path} is not valid.")
+        if self.dataset_subset is None and self.dataset_split != "train":
+            raise ValueError("Dataset split must be 'train'.")
+
+        self.load_data()
+
+    def load_data(self) -> None:
+        dataset = load_dataset(
+            self.dataset_path,
+            name=self.dataset_subset,
+            split=self.dataset_split,
+            streaming=True,
+        )
+        self.data = dataset.shuffle(seed=self.random_seed)
+
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
+               **kwargs) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
-            if parser_fn is None:
-                raise ValueError(
-                    f"Unsupported dataset path: {self.dataset_path}")
-            prompt = parser_fn(item)
+            prompt = item["turns"][0][0]["content"]
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
@ -736,317 +685,4 @@ class VisionArenaDataset(HuggingFaceDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-        return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Instruct Coder Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class InstructCoderDataset(HuggingFaceDataset):
-    """
-    InstructCoder Dataset.
-    https://huggingface.co/datasets/likaixin/InstructCoder
-
-    InstructCoder is the dataset designed for general code editing.  It consists
-    of 114,239 instruction-input-output triplets, and covers multiple distinct
-    code editing scenario.
-    """
-
-    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
-    SUPPORTED_DATASET_PATHS = {
-        "likaixin/InstructCoder",
-    }
-
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
-        sampled_requests = []
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            prompt = f"{item['instruction']}:\n{item['input']}"
-            prompt_len = len(tokenizer(prompt).input_ids)
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-        return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# MT-Bench Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class MTBenchDataset(HuggingFaceDataset):
-    """
-    MT-Bench Dataset.
-    https://huggingface.co/datasets/philschmid/mt-bench
-
-    We create a single turn dataset for MT-Bench. 
-    This is similar to Spec decoding benchmark setup in vLLM
-    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
-    """ # noqa: E501
-
-    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
-    SUPPORTED_DATASET_PATHS = {
-        "philschmid/mt-bench",
-    }
-
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
-        sampled_requests = []
-
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            prompt = item['turns'][0]
-
-            # apply template
-            prompt = tokenizer.apply_chat_template([{
-                "role": "user",
-                "content": prompt
-            }],
-                                                   add_generation_prompt=True,
-                                                   tokenize=False)
-
-            prompt_len = len(tokenizer(prompt).input_ids)
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-        return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# AIMO Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class AIMODataset(HuggingFaceDataset):
-    """
-    Dataset class for processing a AIMO dataset with reasoning questions.
-    """
-    SUPPORTED_DATASET_PATHS = {
-        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
-        "AI-MO/NuminaMath-CoT"
-    }
-
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               **kwargs) -> list:
-        sampled_requests = []
-        dynamic_output = output_len is None
-
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            prompt, completion = item['problem'], item["solution"]
-
-            prompt_ids = tokenizer(prompt).input_ids
-            completion_ids = tokenizer(completion).input_ids
-            prompt_len = len(prompt_ids)
-            completion_len = len(completion_ids)
-            output_len = completion_len if dynamic_output else output_len
-            assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(prompt_len,
-                                                        completion_len,
-                                                        max_prompt_len=2048,
-                                                        max_total_len=32000):
-                continue
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                    multi_modal_data=None,
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-        return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Next Edit Prediction Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-zeta_prompt = """### Instruction:
-You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
-
-### User Edits:
-
-{}
-
-### User Excerpt:
-
-{}
-
-### Response:
-
-""" # noqa: E501
-
-
-def _format_zeta_prompt(
-        sample: dict,
-        original_start_marker: str = "<|editable_region_start|>") -> dict:
-    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
-    
-    This function formats examples from the NEP dataset 
-    into prompts and expected outputs. It could be 
-    further extended to support more NEP datasets.
-    
-    Args:
-        sample: The dataset sample containing events, 
-            inputs, and outputs.
-        original_start_marker: The marker indicating the 
-            start of the editable region. Defaults to 
-            "<|editable_region_start|>".
-            
-    Returns:
-        A dictionary with the formatted prompts and expected outputs.
-    """
-    events = sample["events"]
-    input = sample["input"]
-    output = sample["output"]
-    prompt = zeta_prompt.format(events, input)
-
-    # following the original implementation, extract the focused region
-    # from the raw output
-    output_start_index = output.find(original_start_marker)
-    output_focused_region = output[output_start_index:]
-    expected_output = output_focused_region
-
-    return {"prompt": prompt, "expected_output": expected_output}
-
-
-class NextEditPredictionDataset(HuggingFaceDataset):
-    """
-    Dataset class for processing a Next Edit Prediction dataset.
-    """
-
-    SUPPORTED_DATASET_PATHS = {
-        "zed-industries/zeta",
-    }
-    MAPPING_PROMPT_FUNCS = {
-        "zed-industries/zeta": _format_zeta_prompt,
-    }
-
-    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
-               **kwargs):
-        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
-            self.dataset_path)
-        if formatting_prompt_func is None:
-            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
-        samples = []
-        for sample in self.data:
-            sample = formatting_prompt_func(sample)
-            samples.append(
-                SampleRequest(
-                    prompt=sample["prompt"],
-                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
-                    expected_output_len=len(
-                        tokenizer(sample["expected_output"]).input_ids),
-                ))
-            if len(samples) >= num_requests:
-                break
-        self.maybe_oversample_requests(samples, num_requests)
-        return samples
-
-
-# -----------------------------------------------------------------------------
-# ASR Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ASRDataset(HuggingFaceDataset):
-    """
-    Dataset class for processing a ASR dataset for transcription.
-    Tested on the following set:
-
-    +----------------+----------------------------------------+--------------------------+-----------------------------+
-    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
-    +----------------+----------------------------------------+--------------------------+-----------------------------+
-    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
-    |                |                                        |                          | release3-speaker-adaptation |
-    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
-    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
-    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
-    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
-    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
-    +----------------+----------------------------------------+--------------------------+-----------------------------+
-
-    """ # noqa: E501
-    SUPPORTED_DATASET_PATHS = {
-        "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
-        "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
-    }
-
-    DEFAULT_OUTPUT_LEN = 128
-    IS_MULTIMODAL = True
-
-    # TODO Whisper-specific. Abstract interface when more models are supported.
-    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
-                              "<|notimestamps|>"
-    skip_long_audios: bool = True
-
-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        output_len: Optional[int] = None,
-        **kwargs,
-    ) -> list:
-        import librosa
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
-        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
-        prompt_len = len(tokenizer(prompt).input_ids)
-        sampled_requests = []
-        skipped = 0
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            audio = item["audio"]
-            y, sr = audio["array"], audio["sampling_rate"]
-            duration_s = librosa.get_duration(y=y, sr=sr)
-            # Whisper max supported duration
-            if self.skip_long_audios and duration_s > 30:
-                skipped += 1
-                continue
-
-            mm_content = {"audio": (y, sr)}
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                    multi_modal_data=mm_content,
-                ))
-        if skipped:
-            logger.warning("%d samples discarded from dataset due to" \
-                           " their length being greater than" \
-                           " what Whisper supports.", skipped)
-        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -63,16 +63,14 @@ class Request:
    output_len: int


-def sample_tokens(tokenizer: PreTrainedTokenizerBase,
-                  length: int) -> list[int]:
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
    vocab = tokenizer.get_vocab()
-    all_special_ids = set(tokenizer.all_special_ids)
-
    # Remove the special tokens.
-    return random.choices(
-        [v for k, v in vocab.items() if k not in all_special_ids],
-        k=length,
-    )
+    vocab = {
+        k: v
+        for k, v in vocab.items() if k not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)


 def sample_requests_from_dataset(
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -7,6 +7,9 @@ On the server side, run one of the following commands:
        --swap-space 16 \
        --disable-log-requests

+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
 On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
@ -34,8 +37,7 @@ from datetime import datetime
 from typing import Any, Optional

 import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS,
-                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
@ -50,12 +52,9 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

-from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
-                               ConversationDataset, HuggingFaceDataset,
-                               InstructCoderDataset, MTBenchDataset,
-                               NextEditPredictionDataset, RandomDataset,
-                               SampleRequest, ShareGPTDataset, SonnetDataset,
-                               VisionArenaDataset)
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -157,7 +156,7 @@ def calculate_metrics(
        if outputs[i].success:
            output_len = outputs[i].output_tokens

-            if not output_len:
+            if output_len is None:
                # We use the tokenizer to count the number of output tokens
                # for some serving backends instead of looking at
                # len(outputs[i].itl) since multiple output tokens may be
@ -262,7 +261,6 @@ async def benchmark(
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
-    extra_body: Optional[dict],
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -275,6 +273,10 @@ async def benchmark(
        input_requests[0].expected_output_len, \
            input_requests[0].multi_modal_data

+    if backend != "openai-chat" and test_mm_content is not None:
+        # multi-modal benchmark is only available on OpenAI Chat backend.
+        raise ValueError(
+            "Multi-modal content is only supported on 'openai-chat' backend.")
    assert test_mm_content is None or isinstance(test_mm_content, dict)
    test_input = RequestFuncInput(
        model=model_id,
@ -286,7 +288,6 @@ async def benchmark(
        logprobs=logprobs,
        multi_modal_content=test_mm_content,
        ignore_eos=ignore_eos,
-        extra_body=extra_body,
    )

    test_output = await request_func(request_func_input=test_input)
@ -313,8 +314,7 @@ async def benchmark(
                                         output_len=test_output_len,
                                         logprobs=logprobs,
                                         multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos,
-                                         extra_body=extra_body)
+                                         ignore_eos=ignore_eos)
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")
@ -364,8 +364,7 @@ async def benchmark(
                                              output_len=output_len,
                                              logprobs=logprobs,
                                              multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos,
-                                              extra_body=extra_body)
+                                              ignore_eos=ignore_eos)
        tasks.append(
            asyncio.create_task(
                limited_request_func(request_func_input=request_func_input,
@ -587,55 +586,19 @@ def main(args: argparse.Namespace):
                                            return_prompt_formatted=True)

    elif args.dataset_name == "hf":
-        # all following datasets are implemented from the
-        # HuggingFaceDataset base class
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = VisionArenaDataset
-            args.hf_split = "train"
-            args.hf_subset = None
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = InstructCoderDataset
-            args.hf_split = "train"
-        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = MTBenchDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ConversationDataset
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = AIMODataset
-            args.hf_split = "train"
-        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
-            dataset_class = NextEditPredictionDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ASRDataset
-            args.hf_split = "train"
-        else:
-            supported_datasets = set([
-                dataset_name for cls in HuggingFaceDataset.__subclasses__()
-                for dataset_name in cls.SUPPORTED_DATASET_PATHS
-            ])
-            raise ValueError(
-                f"Unsupported dataset path: {args.dataset_path}. "
-                "Huggingface dataset only supports dataset_path"
-                f" from one of following: {supported_datasets}. "
-                "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
-
-        if (dataset_class.IS_MULTIMODAL and backend not in \
-            ["openai-chat", "openai-audio"]):
-            # multi-modal benchmark is only available on OpenAI Chat backend.
-            raise ValueError(
-                "Multi-modal content is only supported on 'openai-chat' and " \
-                "'openai-audio' backend.")
+        # Choose between VisionArenaDataset
+        # and HuggingFaceDataset based on provided parameters.
+        dataset_class = (VisionArenaDataset if args.dataset_path
+                         == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                         and args.hf_subset is None else HuggingFaceDataset)
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
-            random_seed=args.seed,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
+            random_seed=args.seed,
            output_len=args.hf_output_len,
        )

@ -670,26 +633,6 @@ def main(args: argparse.Namespace):
            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
    goodput_config_dict = check_goodput_args(args)

-    # Collect the sampling parameters.
-    sampling_params = {
-        k: v
-        for k, v in {
-            "top_p": args.top_p,
-            "top_k": args.top_k,
-            "min_p": args.min_p,
-            "temperature": args.temperature
-        }.items() if v is not None
-    }
-
-    # Sampling parameters are only supported by openai-compatible backend.
-    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
-        raise ValueError(
-            "Sampling parameters are only supported by openai-compatible "
-            "backends.")
-
-    if "temperature" not in sampling_params:
-        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
-
    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()
@ -716,11 +659,10 @@ def main(args: argparse.Namespace):
            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
-            extra_body=sampling_params,
        ))

    # Save config and results to json
-    if args.save_result or args.append_result:
+    if args.save_result:
        result_json: dict[str, Any] = {}

        # Setup
@ -741,14 +683,6 @@ def main(args: argparse.Namespace):
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )
-        # Traffic
-        result_json["request_rate"] = (args.request_rate if args.request_rate
-                                       < float("inf") else "inf")
-        result_json["burstiness"] = args.burstiness
-        result_json["max_concurrency"] = args.max_concurrency
-
-        # Merge with benchmark result
-        result_json = {**result_json, **benchmark_result}

        if not args.save_detailed:
            # Remove fields with too many data points
@ -759,6 +693,15 @@ def main(args: argparse.Namespace):
                if field in result_json:
                    del result_json[field]

+        # Traffic
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
+        result_json["burstiness"] = args.burstiness
+        result_json["max_concurrency"] = args.max_concurrency
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
        # Save to file
        base_model_id = model_id.split("/")[-1]
        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@ -768,12 +711,7 @@ def main(args: argparse.Namespace):
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name,
-                  mode="a+" if args.append_result else "w",
-                  encoding='utf-8') as outfile:
-            # Append a newline.
-            if args.append_result and outfile.tell() != 0:
-                outfile.write("\n")
+        with open(file_name, "w", encoding='utf-8') as outfile:
            json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)

@ -905,11 +843,6 @@ if __name__ == "__main__":
        help="When saving the results, whether to include per request "
        "information such as response, error, ttfs, tpots, etc.",
    )
-    parser.add_argument(
-        "--append-result",
-        action="store_true",
-        help="Append the benchmark result to the existing json file.",
-    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
@ -943,7 +876,7 @@ if __name__ == "__main__":
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-seperated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
@ -951,7 +884,7 @@ if __name__ == "__main__":
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-separated list of percentiles for selected metrics. "
+        help="Comma-seperated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
@ -1018,23 +951,18 @@ if __name__ == "__main__":
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
-        default=0.0,
-        help="Range ratio for sampling input/output length, "
-        "used only for random sampling. Must be in the range [0, 1) to define "
-        "a symmetric sampling range"
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
-        help=("Number of fixed prefix tokens before the random context "
-              "in a request. "
-              "The total input length is the sum of `random-prefix-len` and "
-              "a random "
-              "context length sampled from [input_len * (1 - range_ratio), "
-              "input_len * (1 + range_ratio)]."),
-    )
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).")

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
@ -1053,33 +981,6 @@ if __name__ == "__main__":
        "from the sampled HF dataset.",
    )

-    sampling_group = parser.add_argument_group("sampling parameters")
-    sampling_group.add_argument(
-        "--top-p",
-        type=float,
-        default=None,
-        help="Top-p sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--top-k",
-        type=int,
-        default=None,
-        help="Top-k sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--min-p",
-        type=float,
-        default=None,
-        help="Min-p sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--temperature",
-        type=float,
-        default=None,
-        help="Temperature sampling parameter. Only has effect on "
-        "openai-compatible backends. If not specified, default to greedy "
-        "decoding (i.e. temperature==0.0).")
-
    parser.add_argument(
        '--tokenizer-mode',
        type=str,
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -5,13 +5,16 @@ On the server side, run one of the following commands:
    (vLLM OpenAI API server)
    vllm serve <your_model> --disable-log-requests

+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
 On the client side, run:
    python benchmarks/benchmark_serving_structured_output.py \
        --backend <backend> \
        --model <your_model> \
        --dataset json \
        --structured-output-ratio 1.0 \
-        --structured-output-backend auto \
+        --structured-output-backend xgrammar \
        --request-rate 10 \
        --num-prompts 1000

@ -51,7 +54,7 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

-from vllm.v1.structured_output.backend_xgrammar import (
+from vllm.v1.structured_output.utils import (
    has_xgrammar_unsupported_json_features)

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -123,8 +126,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                copy.deepcopy(schema) for _ in range(args.num_prompts)
            ]
            for i in range(len(json_schemas)):
-                if "properties" not in json_schemas[i]:
-                    json_schemas[i]["properties"] = {}
                json_schemas[i]["properties"][
                    f"__optional_field_{uuid.uuid4()}"] = {
                        "type":
@ -132,11 +133,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                        "description":
                        "An unique optional field to avoid cached schemas"
                    }
-        else:
-            json_schemas = [schema] * args.num_prompts

        def gen_prompt(index: int):
-            return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501
+            schema = json_schemas[index % len(json_schemas)]
+            return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501

        def get_schema(index: int):
            return json_schemas[index % len(json_schemas)]
@ -152,17 +152,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,

    elif args.dataset == "grammar":
        schema = """
-        root ::= select_statement
+            ?start: select_statement

-        select_statement ::= "SELECT " column " from " table " where " condition
+            ?select_statement: "SELECT " column_list " FROM " table_name

-        column ::= "col_1 " | "col_2 "
+            ?column_list: column_name ("," column_name)*

-        table ::= "table_1 " | "table_2 "
+            ?table_name: identifier

-        condition ::= column "= " number
+            ?column_name: identifier

-        number ::= "1 " | "2 "
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
        """
        prompt = "Generate an SQL query to show the 'username' \
            and 'email' from the 'users' table."
@ -233,8 +233,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                idx -= len_dataset
            schema = dataset["schema"][idx]
            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
-                                                   tokenize=False,
-                                                   add_generation_prompt=True)
+                                                   tokenize=False)
            input_len = len(tokenizer(prompt).input_ids)
            completion = dataset["completion"][idx]

@ -414,6 +413,7 @@ async def benchmark(
    ignore_eos: bool,
    max_concurrency: Optional[int],
    structured_output_ratio: float,
+    structured_output_backend: str,
    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
@ -425,6 +425,8 @@ async def benchmark(
        extra_body = {}
        # Add the schema to the extra_body
        extra_body[request.structure_type] = request.schema
+        # Add the specific structured_output_backend
+        extra_body["guided_decoding_backend"] = structured_output_backend
        return extra_body

    print("Starting initial single prompt test run...")
@ -730,11 +732,8 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

-    tokenizer = get_tokenizer(
-        tokenizer_id,
-        trust_remote_code=args.trust_remote_code,
-        tokenizer_mode=args.tokenizer_mode,
-    )
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)

    if args.dataset == 'grammar':
        args.structure_type = 'guided_grammar'
@ -782,6 +781,7 @@ def main(args: argparse.Namespace):
            ignore_eos=args.ignore_eos,
            max_concurrency=args.max_concurrency,
            structured_output_ratio=args.structured_output_ratio,
+            structured_output_backend=args.structured_output_backend,
            goodput_config_dict=goodput_config_dict,
        ))

@ -848,7 +848,7 @@ if __name__ == "__main__":
                            'json', 'json-unique', 'grammar', 'regex',
                            'choice', 'xgrammar_bench'
                        ])
-    parser.add_argument("--json-schema-path",
+    parser.add_argument("--json_schema_path",
                        type=str,
                        default=None,
                        help="Path to json schema.")
@ -876,13 +876,6 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
-    parser.add_argument(
-        "--tokenizer-mode",
-        type=str,
-        default="auto",
-        help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-    )
    parser.add_argument(
        "--num-prompts",
        type=int,
@ -963,7 +956,7 @@ if __name__ == "__main__":
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-seperated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
@ -971,7 +964,7 @@ if __name__ == "__main__":
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-separated list of percentiles for selected metrics. "
+        help="Comma-seperated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
@ -996,6 +989,11 @@ if __name__ == "__main__":
                        type=float,
                        default=1.0,
                        help="Ratio of Structured Outputs requests")
+    parser.add_argument("--structured-output-backend",
+                        type=str,
+                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
+                        default="xgrammar",
+                        help="Backend to use for structured outputs")

    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -11,8 +11,7 @@ from typing import Any, Optional, Union

 import torch
 import uvloop
-from benchmark_dataset import (AIMODataset, BurstGPTDataset,
-                               ConversationDataset, InstructCoderDataset,
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
                               RandomDataset, SampleRequest, ShareGPTDataset,
                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -213,17 +212,14 @@ def run_hf(
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
-        prompt = requests[i].prompt
-        prompt_len = requests[i].prompt_len
-        output_len = requests[i].expected_output_len
+        prompt, prompt_len, output_len = requests[i]
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
-            next_prompt_len = requests[i + 1].prompt_len
-            next_output_len = requests[i + 1].expected_output_len
+            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
@ -304,7 +300,6 @@ def get_requests(args, tokenizer):
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
-
    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
@ -322,23 +317,18 @@ def get_requests(args, tokenizer):
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = VisionArenaDataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = InstructCoderDataset
-            common_kwargs['dataset_split'] = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = ConversationDataset
-            common_kwargs['dataset_subset'] = args.hf_subset
-            common_kwargs['dataset_split'] = args.hf_split
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = AIMODataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
+        if args.backend != "vllm-chat":
+            raise ValueError(
+                "hf datasets only are supported by vllm-chat backend")
+        # Choose between VisionArenaDataset and HuggingFaceDataset based on
+        # provided parameters.
+        dataset_cls = (VisionArenaDataset if args.dataset_path
+                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                       and args.hf_subset is None else HuggingFaceDataset)
+        common_kwargs['dataset_subset'] = args.hf_subset
+        common_kwargs['dataset_split'] = args.hf_split
+        sample_kwargs["enable_multimodal_chat"] = True
+
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@ -472,17 +462,9 @@ def validate_args(args):
        warnings.warn("--hf-subset and --hf-split will be ignored \
                since --dataset-name is not 'hf'.",
                      stacklevel=2)
-    elif args.dataset_name == "hf":
-        if args.dataset_path in (
-                VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
-                | ConversationDataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend."  #noqa: E501
-        elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
-                                   | AIMODataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend."  #noqa: E501
-        else:
-            raise ValueError(
-                f"{args.dataset_path} is not supported by hf dataset.")
+    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
+        raise ValueError(
+            "When --dataset-name is 'hf', backend must be 'vllm-chat'")

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != 'random' and args.random_range_ratio is not None:
@ -523,13 +505,6 @@ def validate_args(args):
        raise ValueError(
            "Tokenizer must be the same as the model for MII backend.")

-    # --data-parallel is not supported currently.
-    # https://github.com/vllm-project/vllm/issues/16222
-    if args.data_parallel_size > 1:
-        raise ValueError(
-            "Data parallel is not supported in offline benchmark, \
-            please use benchmark serving instead")
-

 if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
@ -601,30 +576,18 @@ if __name__ == "__main__":
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")
-    parser.add_argument(
-        "--prefix-len",
-        type=int,
-        default=None,
-        help=f"Number of prefix tokens to be used in RandomDataset "
-        "and SonnetDataset. For RandomDataset, the total input "
-        "length is the sum of prefix-len (default: "
-        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
-        "sampled from [input_len * (1 - range_ratio), "
-        "input_len * (1 + range_ratio)]. For SonnetDataset, "
-        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
-        "controls how much of the input is fixed lines versus "
-        "random lines, but the total input length remains approximately "
-        "input_len tokens.")
+    parser.add_argument("--prefix-len",
+                        type=int,
+                        default=None,
+                        help="Number of prefix tokens per request."
+                        "This is for the RandomDataset and SonnetDataset")
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
-        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
-        "for sampling input/output length, "
-        "used only for RandomDataset. Must be in the range [0, 1) to "
-        "define a symmetric sampling range "
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+        help="Range of sampled ratio of input/output length, "
+        "used only for RandomDataSet.",
    )

    # hf dtaset
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@ -1,236 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    MINIMUM_BITBLAS_VERSION)
-
-try:
-    import bitblas
-    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
-        raise ImportError("bitblas version is wrong. Please "
-                          f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
-except ImportError as e:
-    bitblas_import_exception = e
-    raise ValueError("Trying to use the bitblas backend, but could not import"
-                     f"with the following error: {bitblas_import_exception}. "
-                     "Please install bitblas through the following command: "
-                     f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
-                     ) from bitblas_import_exception
-
-from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
-
-from vllm.utils import FlexibleArgumentParser
-
-parser = FlexibleArgumentParser(
-    description="Benchmark BitBLAS int4 on a specific target.")
-
-# Add arguments to the parser
-parser.add_argument(
-    "--target",
-    type=str,
-    default=auto_detect_nvidia_target(),
-    help="Specify the target device for benchmarking.",
-)
-parser.add_argument("--group_size",
-                    type=int,
-                    default=None,
-                    help="Group size for grouped quantization.")
-parser.add_argument(
-    "--A_dtype",
-    type=str,
-    default="float16",
-    choices=["float16", "float32", "float64", "int32", "int8"],
-    help="Data type of activation A.",
-)
-parser.add_argument(
-    "--W_dtype",
-    type=str,
-    default="int4",
-    choices=[
-        "float16",
-        "float32",
-        "float64",
-        "int32",
-        "int8",
-        "int4",
-        "int2",
-        "int1",
-        "nf4",
-        "fp4_e2m1",
-    ],
-    help="Data type of weight W.",
-)
-parser.add_argument(
-    "--accum_dtype",
-    type=str,
-    default="float16",
-    choices=["float16", "int32"],
-    help="Data type for accumulation.",
-)
-parser.add_argument(
-    "--out_dtype",
-    type=str,
-    default="float16",
-    choices=["float16", "float32", "int32", "int8"],
-    help="Data type for output.",
-)
-parser.add_argument(
-    "--layout",
-    type=str,
-    default="nt",
-    choices=["nt", "nn"],
-    help="Matrix layout, 'nt' for non-transpose A and transpose W.",
-)
-parser.add_argument("--with_bias",
-                    action="store_true",
-                    help="Include bias in the benchmark.")
-parser.add_argument(
-    "--with_scaling",
-    action="store_true",
-    help="Include scaling factor in the quantization.",
-)
-parser.add_argument("--with_zeros",
-                    action="store_true",
-                    help="Include zeros in the quantization.")
-parser.add_argument(
-    "--zeros_mode",
-    type=str,
-    default=None,
-    choices=["original", "rescale", "quantized"],
-    help="Specify the mode for calculating zeros.",
-)
-
-# Parse the arguments
-args = parser.parse_args()
-
-# Assign arguments to variables
-target = args.target
-A_dtype = args.A_dtype
-W_dtype = args.W_dtype
-accum_dtype = args.accum_dtype
-out_dtype = args.out_dtype
-layout = args.layout
-with_bias = args.with_bias
-group_size = args.group_size
-with_scaling = args.with_scaling
-with_zeros = args.with_zeros
-zeros_mode = args.zeros_mode
-
-# Define a list of shared arguments that repeat in every config
-shared_args = [
-    A_dtype,
-    W_dtype,
-    out_dtype,
-    accum_dtype,
-    layout,
-    with_bias,
-    group_size,
-    with_scaling,
-    with_zeros,
-    zeros_mode,
-]
-
-# Define just the (M, K, N) shapes in a more compact list
-shapes = [
-    # square test
-    (1, 16384, 16384),
-    # BLOOM-176B
-    (1, 43008, 14336),
-    (1, 14336, 14336),
-    (1, 57344, 14336),
-    (1, 14336, 57344),
-    # OPT-65B
-    (1, 9216, 9216),
-    (1, 36864, 9216),
-    (1, 9216, 36864),
-    (1, 22016, 8192),
-    # LLAMA-70B/65B
-    (1, 8192, 22016),
-    (1, 8192, 8192),
-    (1, 28672, 8192),
-    (1, 8192, 28672),
-    # square test
-    (16384, 16384, 16384),
-    # BLOOM-176B
-    (8192, 43008, 14336),
-    (8192, 14336, 14336),
-    (8192, 57344, 14336),
-    (8192, 14336, 57344),
-    # OPT-65B
-    (8192, 9216, 9216),
-    (8192, 36864, 9216),
-    (8192, 9216, 36864),
-    (8192, 22016, 8192),
-    # LLAMA-70B/65B
-    (8192, 8192, 22016),
-    (8192, 8192, 8192),
-    (8192, 28672, 8192),
-    (8192, 8192, 28672),
-]
-
-# Build test shapes with all the shared arguments
-test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args))
-               for shape in shapes]
-
-benchmark_sets = []
-benchmark_sets.extend(test_shapes)
-
-benchmark_results = {}
-for config_class, operator, input_args in benchmark_sets:
-    config = config_class(*input_args)
-    matmul = operator(config, target=target, enable_tuning=True)
-    kernel_latency = matmul.profile_latency()
-
-    print("Time cost is: {:.3f} ms".format(kernel_latency))
-
-    profile_config = {
-        f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "BitBLAS_top20_latency": kernel_latency,
-        }
-    }
-
-    benchmark_results.update(profile_config)
-
-# Define headers for the table
-headers = [
-    "PrimFunc",
-    "Input Arguments",
-    "BitBLAS Top20 Latency",
-]
-
-# Calculate column widths for pretty printing
-col_widths = [0, 0, 0]
-for config_key, values in benchmark_results.items():
-    args_split = config_key.split("-")
-    func_name = args_split[0]
-    input_args_str = "-".join(args_split[1:])
-    col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
-    col_widths[1] = max(col_widths[1],
-                        len(input_args_str) + 2,
-                        len(headers[1]) + 2)
-    col_widths[2] = max(col_widths[2],
-                        len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
-                        len(headers[2]) + 2)
-    # break only if you want to measure widths from a single example;
-    # otherwise, let it loop over all items.
-
-# Print header
-for i, header in enumerate(headers):
-    headers[i] = header.ljust(col_widths[i])
-print("".join(headers))
-print("-" * sum(col_widths))
-
-# Print rows
-for config_key, values in benchmark_results.items():
-    args_split = config_key.split("-")
-    func_name = args_split[0]
-    input_args_str = "-".join(args_split[1:])
-    row = [
-        func_name,
-        input_args_str,
-        f"{values['BitBLAS_top20_latency']:.3f} ms",
-    ]
-    row_str = "".join(
-        [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)])
-    print(row_str)
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -1,341 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-import torch.utils.benchmark as benchmark
-from benchmark_shapes import WEIGHT_SHAPES_MOE
-
-from vllm import _custom_ops as ops
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
-                                                            fused_experts,
-                                                            fused_topk)
-from vllm.utils import FlexibleArgumentParser
-
-DEFAULT_MODELS = [
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite",
-    "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m"
-]
-DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
-DEFAULT_TP_SIZES = [1]
-
-PER_ACT_TOKEN_OPTS = [False]
-PER_OUT_CH_OPTS = [False]
-
-
-def to_fp8(tensor: torch.Tensor):
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def bench_run(results: list[benchmark.Measurement], model: str,
-              num_experts: int, topk: int, per_act_token: bool,
-              per_out_ch: bool, mkn: tuple[int, int, int]):
-    label = "Quant Matmul"
-
-    sub_label = (
-        "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, "
-        "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch,
-                          mkn))
-
-    print(f"Testing: {sub_label}")
-
-    (m, k, n) = mkn
-
-    dtype = torch.half
-
-    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
-    w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10
-
-    _, a_scale = ops.scaled_fp8_quant(a)
-
-    w1_q = torch.empty((num_experts, 2 * n, k),
-                       device="cuda",
-                       dtype=torch.float8_e4m3fn)
-    w2_q = torch.empty((num_experts, k, n),
-                       device="cuda",
-                       dtype=torch.float8_e4m3fn)
-    w1_scale = torch.empty((num_experts, 1, 1),
-                           device="cuda",
-                           dtype=torch.float32)
-    w2_scale = torch.empty((num_experts, 1, 1),
-                           device="cuda",
-                           dtype=torch.float32)
-
-    ab_strides1 = torch.full((num_experts, ),
-                             k,
-                             device="cuda",
-                             dtype=torch.int64)
-    c_strides1 = torch.full((num_experts, ),
-                            2 * n,
-                            device="cuda",
-                            dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts, ),
-                             n,
-                             device="cuda",
-                             dtype=torch.int64)
-    c_strides2 = torch.full((num_experts, ),
-                            k,
-                            device="cuda",
-                            dtype=torch.int64)
-
-    for expert in range(num_experts):
-        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
-        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
-    w1_q_notransp = w1_q.clone()
-    w2_q_notransp = w2_q.clone()
-    w1_q = w1_q.transpose(1, 2)
-    w2_q = w2_q.transpose(1, 2)
-
-    score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
-
-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        a, score, topk, renormalize=False)
-
-    def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
-                       topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                       w1_scale: torch.Tensor, w2_scale: torch.Tensor,
-                       a_scale: torch.Tensor, num_repeats: int):
-        for _ in range(num_repeats):
-            fused_experts(a,
-                          w1,
-                          w2,
-                          topk_weights,
-                          topk_ids,
-                          use_fp8_w8a8=True,
-                          w1_scale=w1_scale,
-                          w2_scale=w2_scale,
-                          a1_scale=a_scale)
-
-    def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor,
-                        w1: torch.Tensor, w2: torch.Tensor,
-                        w1_scale: torch.Tensor, w2_scale: torch.Tensor,
-                        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
-                        ab_strides2: torch.Tensor, c_strides2: torch.Tensor,
-                        num_repeats: int):
-        for _ in range(num_repeats):
-            cutlass_moe_fp8(a,
-                            w1,
-                            w2,
-                            w1_scale,
-                            w2_scale,
-                            topk_weights,
-                            topk_ids,
-                            ab_strides1,
-                            c_strides1,
-                            ab_strides2,
-                            c_strides2,
-                            a1_scale=a_scale)
-
-    def run_cutlass_from_graph(
-            a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
-            w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
-            topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-            ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
-            ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
-        with set_current_vllm_config(
-                VllmConfig(parallel_config=ParallelConfig(
-                    pipeline_parallel_size=1))):
-            return cutlass_moe_fp8(a,
-                                   w1_q,
-                                   w2_q,
-                                   w1_scale,
-                                   w2_scale,
-                                   topk_weights,
-                                   topk_ids,
-                                   ab_strides1,
-                                   c_strides1,
-                                   ab_strides2,
-                                   c_strides2,
-                                   a1_scale=a_scale)
-
-    def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor,
-                              w2: torch.Tensor, topk_weights: torch.Tensor,
-                              topk_ids: torch.Tensor, w1_scale: torch.Tensor,
-                              w2_scale: torch.Tensor, a_scale: torch.Tensor):
-        with set_current_vllm_config(
-                VllmConfig(parallel_config=ParallelConfig(
-                    pipeline_parallel_size=1))):
-            return fused_experts(a,
-                                 w1,
-                                 w2,
-                                 topk_weights,
-                                 topk_ids,
-                                 use_fp8_w8a8=True,
-                                 w1_scale=w1_scale,
-                                 w2_scale=w2_scale,
-                                 a1_scale=a_scale)
-
-    def replay_graph(graph, num_repeats):
-        for _ in range(num_repeats):
-            graph.replay()
-        torch.cuda.synchronize()
-
-    cutlass_stream = torch.cuda.Stream()
-    cutlass_graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
-        run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale,
-                               topk_weights, topk_ids, ab_strides1, c_strides1,
-                               ab_strides2, c_strides2)
-    torch.cuda.synchronize()
-
-    triton_stream = torch.cuda.Stream()
-    triton_graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(triton_graph, stream=triton_stream):
-        run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights,
-                              topk_ids, w1_scale, w2_scale, a_scale)
-    torch.cuda.synchronize()
-
-    min_run_time = 5
-    num_warmup = 5
-    num_runs = 25
-
-    globals = {
-        # Baseline params
-        "w1": w1,
-        "w2": w2,
-        "score": score,
-        "topk": topk,
-        "w1_q_notransp": w1_q_notransp,
-        "w2_q_notransp": w2_q_notransp,
-        # Cutlass params
-        "a_scale": a_scale,
-        "w1_q": w1_q,
-        "w2_q": w2_q,
-        "w1_scale": w1_scale,
-        "w2_scale": w2_scale,
-        "ab_strides1": ab_strides1,
-        "c_strides1": c_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides2": c_strides2,
-        # cuda graph params
-        "cutlass_graph": cutlass_graph,
-        "triton_graph": triton_graph,
-        # Gen params
-        "a": a,
-        "topk_weights": topk_weights,
-        "topk_ids": topk_ids,
-        "num_runs": num_runs,
-        # Kernels
-        "run_triton_moe": run_triton_moe,
-        "run_cutlass_moe": run_cutlass_moe,
-        "replay_graph": replay_graph,
-    }
-
-    # Warmup
-    run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids,
-                   w1_scale, w2_scale, a_scale, num_warmup)
-
-    results.append(
-        benchmark.Timer(
-            stmt=
-            "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="triton_moe",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-    # Warmup
-    replay_graph(triton_graph, num_warmup)
-
-    results.append(
-        benchmark.Timer(
-            stmt="replay_graph(triton_graph, num_runs)",
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="triton_moe_cuda_graphs",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-    # Warmup
-    run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights,
-                    topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2,
-                    num_warmup)
-
-    results.append(
-        benchmark.Timer(
-            stmt=
-            "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="grouped_gemm_moe",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-    # Warmup
-    replay_graph(cutlass_graph, num_warmup)
-
-    results.append(
-        benchmark.Timer(
-            stmt="replay_graph(cutlass_graph, num_runs)",
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="grouped_gemm_moe_cuda_graphs",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-
-def main(args):
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    results: list[benchmark.Measurement] = []
-
-    for model in args.models:
-        for tp in args.tp_sizes:
-            for layer in WEIGHT_SHAPES_MOE[model]:
-                num_experts = layer[0]
-                topk = layer[1]
-                size_k = layer[2]
-                size_n = layer[3] // tp
-
-                if len(args.limit_k) > 0 and size_k not in args.limit_k:
-                    continue
-
-                if len(args.limit_n) > 0 and size_n not in args.limit_n:
-                    continue
-
-                for per_act_token in PER_ACT_TOKEN_OPTS:
-                    for per_out_ch in PER_OUT_CH_OPTS:
-                        for size_m in DEFAULT_BATCH_SIZES:
-                            mkn = (size_m, size_k, size_n)
-                            bench_run(results, model, num_experts, topk,
-                                      per_act_token, per_out_ch, mkn)
-
-    compare = benchmark.Compare(results)
-    compare.print()
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="Benchmark Marlin across specified models/shapes/batches")
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=DEFAULT_MODELS,
-        choices=WEIGHT_SHAPES_MOE.keys(),
-    )
-    parser.add_argument("--tp-sizes",
-                        nargs="+",
-                        type=int,
-                        default=DEFAULT_TP_SIZES)
-    parser.add_argument("--batch-sizes",
-                        nargs="+",
-                        type=int,
-                        default=DEFAULT_BATCH_SIZES)
-    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-per-act-token",
-                        nargs="+",
-                        type=int,
-                        default=[])
-    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])
-
-    args = parser.parse_args()
-    main(args)
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -17,14 +17,13 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES

-from vllm.triton_utils import HAS_TRITON
-
-if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
-                                          lora_shrink)
-    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
-                                                _LORA_B_PTR_DICT)
-
+from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
+from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
+from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -168,25 +167,69 @@ class OpType(Enum):
    """
    LoRA Ops to benchmark and its properties.
    """
-    LORA_SHRINK = auto()
-    LORA_EXPAND = auto()
+    SGMV_SHRINK = auto()
+    BGMV_SHRINK = auto()
+    SGMV_EXPAND = auto()
+    BGMV_EXPAND = auto()
+    BGMV_EXPAND_SLICE = auto()
+    V1_SHRINK = auto()
+    V1_EXPAND = auto()

    @staticmethod
    def from_str(s: str) -> "OpType":
-        if s.lower() == "lora_shrink":
-            return OpType.LORA_SHRINK
-        if s.lower() == "lora_expand":
-            return OpType.LORA_EXPAND
+        if s.lower() == 'sgmv_shrink':
+            return OpType.SGMV_SHRINK
+        if s.lower() == 'sgmv_expand':
+            return OpType.SGMV_EXPAND
+        if s.lower() == 'bgmv_shrink':
+            return OpType.BGMV_SHRINK
+        if s.lower() == 'bgmv_expand':
+            return OpType.BGMV_EXPAND
+        if s.lower() == "bgmv_expand_slice":
+            return OpType.BGMV_EXPAND_SLICE
+        if s.lower() == "v1_shrink":
+            return OpType.V1_SHRINK
+        if s.lower() == "v1_expand":
+            return OpType.V1_EXPAND
        raise ValueError(f"Unrecognized str {s} to convert to OpType")

    def is_shrink_fn(self) -> bool:
-        return self in [OpType.LORA_SHRINK]
+        return self in [
+            OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
+        ]

    def is_expand_fn(self) -> bool:
-        return self in [OpType.LORA_EXPAND]
+        return self in [
+            OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
+        ]
+
+    def is_prefill_op(self) -> bool:
+        return self in [
+            OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
+            OpType.V1_EXPAND
+        ]
+
+    def is_decode_op(self) -> bool:
+        return self in [
+            OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
+            OpType.V1_SHRINK, OpType.V1_EXPAND
+        ]
+
+    def is_expand_slice_fn(self) -> bool:
+        return self in [OpType.BGMV_EXPAND_SLICE]

    def num_slices(self) -> list[int]:
-        return [1, 2, 3]
+        if self in [
+                OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
+                OpType.V1_EXPAND
+        ]:
+            # SGMV kernels and v1 kernels support slices
+            return [1, 2, 3]
+        if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
+            return [1]
+        if self in [OpType.BGMV_EXPAND_SLICE]:
+            return [2, 3]
+        raise ValueError(f"Unrecognized OpType {self}")

    def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
            lora_rank: int) -> tuple[int, int, int]:
@ -196,7 +239,7 @@ class OpType(Enum):
            k = hidden_size
            n = lora_rank
        else:
-            assert self.is_expand_fn()
+            assert self.is_expand_fn() or self.is_expand_slice_fn()
            m = num_tokens
            k = lora_rank
            n = hidden_size
@ -211,7 +254,7 @@ class OpType(Enum):
        if self.is_shrink_fn():
            return op_dtype, op_dtype, torch.float32
        else:
-            assert self.is_expand_fn()
+            assert self.is_expand_fn() or self.is_expand_slice_fn()
            return torch.float32, op_dtype, op_dtype

    def matmul_shapes(
@ -225,19 +268,43 @@ class OpType(Enum):
        m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)

        b_shape = (num_loras, n, k)  # col-major
-        if self in [OpType.LORA_SHRINK]:
-            # LoRA shrink kernels support num_slices inherently in the kernel.
+        if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
+            # SGMV shrink and V1 shrink kernels support num_slices inherently
+            # in the kernel.
            return ((m, k), b_shape, (num_slices, m, n))
-        if self in [OpType.LORA_EXPAND]:
-            # LoRA expand kernels support num_slices inherently in the kernel
+        if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
+            # SGMV expand and V1 expand kernels support num_slices inherently
+            # in the kernel
            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self == OpType.BGMV_SHRINK:
+            return ((m, k), b_shape, (m, n))
+        if self == OpType.BGMV_EXPAND:
+            return ((m, k), b_shape, (m, n))
+        if self == OpType.BGMV_EXPAND_SLICE:
+            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+
        raise ValueError(f"Unrecognized op_type {self}")

    def bench_fn(self) -> Callable:
-        if self == OpType.LORA_SHRINK:
-            return lora_shrink
-        if self == OpType.LORA_EXPAND:
-            return lora_expand
+
+        def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
+            for x in kwargs_list:
+                bgmv_expand_slice(**x)
+
+        if self == OpType.SGMV_SHRINK:
+            return sgmv_shrink
+        if self == OpType.SGMV_EXPAND:
+            return sgmv_expand
+        if self == OpType.BGMV_SHRINK:
+            return bgmv_shrink
+        if self == OpType.BGMV_EXPAND:
+            return bgmv_expand
+        if self == OpType.BGMV_EXPAND_SLICE:
+            return emulate_bgmv_expand_slice
+        if self == OpType.V1_SHRINK:
+            return v1_shrink
+        if self == OpType.V1_EXPAND:
+            return v1_expand

        raise ValueError(f"Unrecognized optype {self}")

@ -251,13 +318,34 @@ class OpType(Enum):
        """
        w_dtype = lora_weights[0].dtype
        num_slices = len(lora_weights)
-        if self in [OpType.LORA_SHRINK]:
+        if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
            for slice_idx in range(num_slices):
                ref_group_gemm(ref_out=output[slice_idx, :],
                               input=input,
                               lora_weights=lora_weights[slice_idx],
                               **kwargs)
-        elif self in [OpType.LORA_EXPAND]:
+        elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
+            hidden_size = lora_weights[0].shape[1]
+            for slice_idx in range(num_slices):
+                slice_offset = slice_idx * hidden_size
+                ref_group_gemm(
+                    ref_out=output[:, slice_offset:slice_offset + hidden_size],
+                    input=input[slice_idx].clone().to(dtype=w_dtype),
+                    lora_weights=lora_weights[slice_idx],
+                    **kwargs)
+        elif self == OpType.BGMV_SHRINK:
+            assert num_slices == 1
+            ref_group_gemm(ref_out=output,
+                           input=input,
+                           lora_weights=lora_weights[0],
+                           **kwargs)
+        elif self == OpType.BGMV_EXPAND:
+            assert num_slices == 1
+            ref_group_gemm(ref_out=output,
+                           input=input.clone().to(dtype=w_dtype),
+                           lora_weights=lora_weights[0],
+                           **kwargs)
+        elif self == OpType.BGMV_EXPAND_SLICE:
            hidden_size = lora_weights[0].shape[1]
            for slice_idx in range(num_slices):
                slice_offset = slice_idx * hidden_size
@ -323,11 +411,13 @@ class BenchmarkTensors:
    input: torch.Tensor
    lora_weights_lst: list[torch.Tensor]
    output: torch.Tensor
-    # LoRA kernel metadata
-    lora_kernel_meta: LoRAKernelMeta
-    # Metadata tensors used in testing correctness
+    # metadata tensors
    seq_lens: torch.Tensor
+    seq_start_loc: torch.Tensor
    prompt_lora_mapping: torch.Tensor
+    token_lora_mapping: torch.Tensor
+    # v1 kernel metadata
+    v1_kernel_meta: Optional[V1KernelMeta] = None

    def io_types(self) -> str:
        return (f"{dtype_to_str(self.input.dtype)}x"
@ -354,29 +444,35 @@ class BenchmarkTensors:
        assert ctx.num_active_loras <= ctx.num_loras
        total_tokens = ctx.batch_size * ctx.seq_length

-        # Make metadata tensors involved in correctness testing.
        # Prepare seq lens tensor
        seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
                                       (ctx.batch_size, ))
+        # Prepare seq_start_loc tensor
+        seq_start_loc_tensor = torch.cumsum(torch.tensor(
+            [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+                                            dim=0)
        assert total_tokens == seq_len_tensor.sum()
        # Prepare prompt lora indices tensor
        prompt_lora_indices_tensor = make_prompt_lora_mapping(
            ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
-
-        # Make LoRAKernelMeta
+        # Prepare token lora indices tensor
        token_lora_indices_tensor = make_token_lora_mapping(
            total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
            seq_len_tensor, "cpu")
-        lora_kernel_meta = LoRAKernelMeta.make(
-            max_loras=ctx.num_loras,
-            max_num_tokens=token_lora_indices_tensor.size(0),
-            device="cpu")
-        lora_kernel_meta.prepare_tensors(
-            token_lora_mapping=token_lora_indices_tensor)
+
+        v1_kernel_meta = None
+        if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
+            v1_kernel_meta = V1KernelMeta.make(
+                max_loras=ctx.num_loras,
+                max_num_tokens=token_lora_indices_tensor.size(0),
+                device="cpu")
+            v1_kernel_meta.prepare_tensors(
+                token_lora_mapping=token_lora_indices_tensor)

        return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
-                                lora_kernel_meta, seq_len_tensor,
-                                prompt_lora_indices_tensor)
+                                seq_len_tensor, seq_start_loc_tensor,
+                                prompt_lora_indices_tensor,
+                                token_lora_indices_tensor, v1_kernel_meta)

    def sanity_check(self) -> None:
        """
@ -386,9 +482,9 @@ class BenchmarkTensors:
        # check metadata tensors
        assert torch.sum(self.seq_lens) == num_tokens
        num_seqs = self.seq_lens.shape[0]
-        #assert self.seq_start_loc.shape[0] == num_seqs
+        assert self.seq_start_loc.shape[0] == num_seqs
        assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
+        assert self.token_lora_mapping.shape[0] == num_tokens

    def to_device(self, device: str):
        """
@ -403,27 +499,220 @@ class BenchmarkTensors:
        self.input = to_device(self.input)
        self.output = to_device(self.output)
        self.seq_lens = to_device(self.seq_lens)
+        self.seq_start_loc = to_device(self.seq_start_loc)
        self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
+        self.token_lora_mapping = to_device(self.token_lora_mapping)
        for i in range(len(self.lora_weights_lst)):
            self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])

-        # LoRA meta
-        for field_name in LoRAKernelMeta.__dataclass_fields__:
-            field = getattr(self.lora_kernel_meta, field_name)
-            assert isinstance(field, torch.Tensor)
-            setattr(self.lora_kernel_meta, field_name, to_device(field))
+        # v1 meta
+        if self.v1_kernel_meta:
+            for field_name in V1KernelMeta.__dataclass_fields__:
+                field = getattr(self.v1_kernel_meta, field_name)
+                assert isinstance(field, torch.Tensor)
+                setattr(self.v1_kernel_meta, field_name, to_device(field))

    def metadata(self) -> tuple[int, int, int]:
        """
        Return num_seqs, num_tokens and max_seq_len
        """
        num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
+        num_tokens = self.token_lora_mapping.shape[0]
        max_seq_len = torch.max(self.seq_lens).item()
        num_slices = len(self.lora_weights_lst)
        return num_seqs, num_tokens, max_seq_len, num_slices

-    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
+    def convert_to_sgmv_benchmark_tensors(self):
+        """
+        For sgmv punica kernels, when consecutive sequences have the
+        same LoRA ID, we just merge them together.
+        This happens in punica.py::compute_metadata
+        """
+
+        # Collapse seq_lens and seq_start_loc
+        _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
+                                               return_counts=True)
+        cum_result = torch.cumsum(seq_lens, dim=0)
+        seq_start_loc = torch.zeros_like(seq_lens)
+        seq_start_loc[1:].copy_(cum_result[:-1])
+
+        # Collapse prompt mapping
+        prompt_lora_mapping = torch.unique_consecutive(
+            self.prompt_lora_mapping)
+
+        assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
+         f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
+
+        self.prompt_lora_mapping = prompt_lora_mapping.to(
+            dtype=self.prompt_lora_mapping.dtype)
+        self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
+        self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
+
+    def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
+        self.convert_to_sgmv_benchmark_tensors()
+        self.sanity_check()
+        self.to_device(self.input.device)
+
+        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, hidden_size]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == hidden_size
+        lora_rank = lw_shape[1]
+        # Expected output shape [num_slices, num_tokens, lora_rank]
+        assert len(o_shape) == 3
+        assert o_shape == (num_slices, num_tokens, lora_rank)
+
+        return {
+            'inputs': self.input,
+            'lora_a_weights': self.lora_weights_lst,
+            'output_tensor': self.output,
+            'b_seq_start_loc': self.seq_start_loc,
+            'seq_len_tensor': self.seq_lens,
+            'lora_indices_tensor': self.prompt_lora_mapping,
+            'batches': num_seqs,
+            'max_seq_length': max_seq_len,
+            'token_nums': num_tokens,
+            'scaling': 1.0,
+        }
+
+    def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
+
+        self.convert_to_sgmv_benchmark_tensors()
+        self.sanity_check()
+        self.to_device(self.input.device)
+
+        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape : [num_slices, num_tokens, lora_rank]
+        assert len(i_shape) == 3
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[2]
+        # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape : [num_tokens, hidden_size * num_slices]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size * num_slices)
+
+        return {
+            'inputs': self.input,
+            'lora_b_weights': self.lora_weights_lst,
+            'output_tensor': self.output,
+            'b_seq_start_loc': self.seq_start_loc,
+            'seq_len_tensor': self.seq_lens,
+            'lora_indices_tensor': self.prompt_lora_mapping,
+            'batches': num_seqs,
+            'max_seq_length': max_seq_len,
+            'token_nums': num_tokens,
+            'offset_start': 0,
+            'add_inputs': add_inputs,
+        }
+
+    def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
+        assert len(self.lora_weights_lst) == 1
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, _ = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, hidden_size]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == hidden_size
+        lora_rank = lw_shape[1]
+        # Expected output shape [num_tokens, lora_rank]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, lora_rank)
+
+        return {
+            'inputs': self.input,
+            'lora_a_weights': self.lora_weights_lst[0],
+            'output_tensor': self.output,
+            'lora_indices_tensor': self.token_lora_mapping,
+            'scaling': 1.0
+        }
+
+    def as_bgmv_expand_kwargs(self, add_inputs: bool):
+        assert len(self.lora_weights_lst) == 1
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, _ = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, lora_rank]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        lora_rank = i_shape[1]
+        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape [num_tokens, hidden_size]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size)
+
+        return {
+            'inputs': self.input,
+            'lora_b_weights': self.lora_weights_lst[0],
+            'output_tensor': self.output,
+            'lora_indices_tensor': self.token_lora_mapping,
+            'add_inputs': add_inputs
+        }
+
+    def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
+
+        _, num_tokens, _, num_slices = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_slices, num_tokens, lora_rank]
+        assert len(i_shape) == 3
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[2]
+        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape [num_tokens, hidden_size * num_slices]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size * num_slices)
+
+        self.to_device(self.input.device)
+
+        kwargs_list = []
+        for i in range(num_slices):
+            kwargs_list.append({
+                'inputs': self.input[i],
+                'lora_b_weights': self.lora_weights_lst[i],
+                'output_tensor': self.output,
+                'lora_indices_tensor': self.token_lora_mapping,
+                'slice_offset': i * hidden_size,
+                'slice_size': hidden_size,
+                'add_inputs': add_inputs,
+            })
+        return {'kwargs_list': kwargs_list}
+
+    def as_v1_shrink_kwargs(self) -> dict[str, Any]:
+        assert self.v1_kernel_meta is not None
        self.sanity_check()
        self.to_device(self.input.device)

@ -448,16 +737,17 @@ class BenchmarkTensors:
            'inputs': self.input,
            'lora_a_weights': self.lora_weights_lst,
            'output_tensor': self.output,
-            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
+            'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
            'token_indices_sorted_by_lora_ids':
-            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
-            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
-            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
-            'lora_ids': self.lora_kernel_meta.active_lora_ids,
+            self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
+            'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
+            'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
+            'lora_ids': self.v1_kernel_meta.active_lora_ids,
            'scaling': 1.0,
        }

-    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
+    def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
+        assert self.v1_kernel_meta is not None
        self.sanity_check()
        self.to_device(self.input.device)

@ -483,12 +773,12 @@ class BenchmarkTensors:
            'inputs': self.input,
            'lora_b_weights': self.lora_weights_lst,
            'output_tensor': self.output,
-            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
+            'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
            'token_indices_sorted_by_lora_ids':
-            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
-            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
-            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
-            'lora_ids': self.lora_kernel_meta.active_lora_ids,
+            self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
+            'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
+            'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
+            'lora_ids': self.v1_kernel_meta.active_lora_ids,
            'offset_start': 0,
            'add_inputs': add_inputs,
        }
@ -501,10 +791,20 @@ class BenchmarkTensors:
        else:
            assert add_inputs is not None

-        if op_type == OpType.LORA_SHRINK:
-            return self.as_lora_shrink_kwargs()
-        if op_type == OpType.LORA_EXPAND:
-            return self.as_lora_expand_kwargs(add_inputs)
+        if op_type == OpType.SGMV_SHRINK:
+            return self.as_sgmv_shrink_kwargs()
+        if op_type == OpType.SGMV_EXPAND:
+            return self.as_sgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_SHRINK:
+            return self.as_bgmv_shrink_kwargs()
+        if op_type == OpType.BGMV_EXPAND:
+            return self.as_bgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_EXPAND_SLICE:
+            return self.as_bgmv_expand_slice_kwargs(add_inputs)
+        if op_type == OpType.V1_SHRINK:
+            return self.as_v1_shrink_kwargs()
+        if op_type == OpType.V1_EXPAND:
+            return self.as_v1_expand_kwargs(add_inputs)
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(self, op_type: OpType,
@ -693,6 +993,10 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
    for bench_ctx in bench_ctxs:
        for seq_len in args.seq_lengths:
            bench_ops: list[OpType] = args.op_types
+            if seq_len > 1:
+                # bench only prefill ops
+                bench_ops = [op for op in args.op_types if op.is_prefill_op()]
+
            seq_len_timers = []
            for bench_op in bench_ops:
                for num_slices in bench_op.num_slices():
@ -902,13 +1206,13 @@ Benchmark LoRA kernels:
    {use_cuda_graph_recommendation()}

    list_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32

    model_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 
+        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 

    range_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
+        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -6,17 +6,16 @@ import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict

 import ray
 import torch
+import triton
 from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
-from vllm.transformers_utils.config import get_config
-from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()
@ -31,18 +30,19 @@ class BenchmarkConfig(TypedDict):
    num_stages: int


-def benchmark_config(config: BenchmarkConfig,
-                     num_tokens: int,
-                     num_experts: int,
-                     shard_intermediate_size: int,
-                     hidden_size: int,
-                     topk: int,
-                     dtype: torch.dtype,
-                     use_fp8_w8a8: bool,
-                     use_int8_w8a16: bool,
-                     num_iters: int = 100,
-                     block_quant_shape: List[int] = None,
-                     use_deep_gemm: bool = False) -> float:
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    block_quant_shape: List[int] = None,
+) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    if use_int8_w8a16:
@ -115,41 +115,22 @@ def benchmark_config(config: BenchmarkConfig,
    def run():
        from vllm.model_executor.layers.fused_moe import override_config
        with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False)
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            fused_moe(
+                x,
+                w1,
+                w2,
+                input_gating,
+                topk,
+                renormalize=True,
+                inplace=True,
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                block_shape=block_quant_shape,
+            )

    # JIT compilation & warmup
    run()
@ -385,7 +366,6 @@ class BenchmarkWorker:
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        block_quant_shape: List[int] = None,
-        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
@ -416,8 +396,7 @@ class BenchmarkWorker:
                                       use_fp8_w8a8,
                                       use_int8_w8a16,
                                       num_iters=100,
-                                       block_quant_shape=block_quant_shape,
-                                       use_deep_gemm=use_deep_gemm)
+                                       block_quant_shape=block_quant_shape)
        return config, kernel_time

    def tune(
@ -432,7 +411,6 @@ class BenchmarkWorker:
        use_int8_w8a16: bool,
        search_space: list[dict[str, int]],
        block_quant_shape: list[int],
-        use_deep_gemm: bool,
    ) -> dict[str, int]:
        best_config = None
        best_time = float("inf")
@ -443,14 +421,8 @@ class BenchmarkWorker:
                                                   hidden_size, search_space,
                                                   is_fp16, topk)

-        need_device_guard = False
-        if current_platform.is_rocm():
-            visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
-            if visible_device != f"{self.device_id}":
-                need_device_guard = True
-
-        with torch.cuda.device(
-                self.device_id) if need_device_guard else nullcontext():
+        with torch.cuda.device(self.device_id) if current_platform.is_rocm(
+        ) else nullcontext():
            for config in tqdm(search_space):
                try:
                    kernel_time = benchmark_config(
@ -464,8 +436,7 @@ class BenchmarkWorker:
                        use_fp8_w8a8,
                        use_int8_w8a16,
                        num_iters=20,
-                        block_quant_shape=block_quant_shape,
-                        use_deep_gemm=use_deep_gemm)
+                        block_quant_shape=block_quant_shape)
                except triton.runtime.autotuner.OutOfResources:
                    # Some configurations may be invalid and fail to compile.
                    continue
@ -534,13 +505,9 @@ def get_weight_block_size_safety(config, default_value=None):

 def main(args: argparse.Namespace):
    print(args)
-
-    config = get_config(model=args.model,
-                        trust_remote_code=args.trust_remote_code)
-    if args.model_prefix:
-        config = getattr(config, args.model_prefix)
-    config = SimpleNamespace(**config)
-
+    block_quant_shape = None
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
    if config.architectures[0] == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
@ -551,21 +518,19 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif (config.architectures[0]
-          in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM")):
+    elif (config.architectures[0] == "DeepseekV3ForCausalLM"
+          or config.architectures[0] == "DeepseekV2ForCausalLM"):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM",
-                                     "Qwen3MoeForCausalLM"):
+        block_quant_shape = get_weight_block_size_safety(config)
+    elif config.architectures[0] == "Qwen2MoeForCausalLM":
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
-        # Support for llama4
-        config = config.get_text_config()
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
@ -573,11 +538,9 @@ def main(args: argparse.Namespace):
        shard_intermediate_size = 2 * intermediate_size // args.tp_size

    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else getattr(
-        torch, config.torch_dtype)
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
-    block_quant_shape = get_weight_block_size_safety(config)

    if args.batch_size is None:
        batch_sizes = [
@ -587,17 +550,6 @@ def main(args: argparse.Namespace):
    else:
        batch_sizes = [args.batch_size]

-    use_deep_gemm = bool(args.use_deep_gemm)
-
-    if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
-        # Ray will set ROCR_VISIBLE_DEVICES for device visibility
-        logger.warning(
-            "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
-            "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
-        val = os.environ["HIP_VISIBLE_DEVICES"]
-        os.environ["ROCR_VISIBLE_DEVICES"] = val
-        del os.environ["HIP_VISIBLE_DEVICES"]
-
    ray.init()
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -620,10 +572,10 @@ def main(args: argparse.Namespace):

        start = time.time()
        configs = _distribute(
-            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
-                      block_quant_shape, use_deep_gemm)
-                     for batch_size in batch_sizes])
+            "tune",
+            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
+              use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
+             for batch_size in batch_sizes])
        best_configs = {
            M: sort_config(config)
            for M, config in zip(batch_sizes, configs)
@ -637,7 +589,7 @@ def main(args: argparse.Namespace):
        outputs = _distribute(
            "benchmark",
            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-              use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
+              use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
             for batch_size in batch_sizes])

        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -659,12 +611,10 @@ if __name__ == "__main__":
                        type=str,
                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
                        default="auto")
-    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true")
    parser.add_argument("--trust-remote-code", action="store_true")
-    parser.add_argument("--model-prefix", type=str, required=False)
    args = parser.parse_args()

    main(args)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -1,349 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import argparse
-from typing import Any, TypedDict
-
-import ray
-import torch
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-    _moe_permute, _moe_unpermute_and_reduce)
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
-from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
-from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
-
-FP8_DTYPE = current_platform.fp8_dtype()
-
-
-class BenchmarkConfig(TypedDict):
-    BLOCK_SIZE_M: int
-    BLOCK_SIZE_N: int
-    BLOCK_SIZE_K: int
-    GROUP_SIZE_M: int
-    num_warps: int
-    num_stages: int
-
-
-def benchmark_permute(num_tokens: int,
-                      num_experts: int,
-                      hidden_size: int,
-                      topk: int,
-                      dtype: torch.dtype,
-                      use_fp8_w8a8: bool,
-                      use_int8_w8a16: bool,
-                      num_iters: int = 100,
-                      use_customized_permute: bool = False) -> float:
-    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
-    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    # output_hidden_states = torch.empty_like(hidden_states)
-    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
-        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
-    else:
-        align_block_size = None
-        qhidden_states = hidden_states
-
-    gating_output = torch.randn(num_iters,
-                                num_tokens,
-                                num_experts,
-                                dtype=torch.float32)
-
-    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        qhidden_states, input_gating, topk, False)
-
-    def prepare(i: int):
-        input_gating.copy_(gating_output[i])
-
-    def run():
-        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx,
-             m_indices) = moe_permute(
-                 qhidden_states,
-                 topk_weights=topk_weights,
-                 topk_ids=topk_ids,
-                 token_expert_indices=token_expert_indices,
-                 topk=topk,
-                 n_expert=num_experts,
-                 n_local_expert=num_experts,
-                 expert_map=None,
-                 align_block_size=align_block_size,
-             )
-        else:
-            (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
-             inv_perm) = _moe_permute(qhidden_states, None, topk_ids,
-                                      num_experts, None, align_block_size)
-
-    # JIT compilation & warmup
-    run()
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run()
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies: list[float] = []
-    for i in range(num_iters):
-        prepare(i)
-        torch.cuda.synchronize()
-
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-def benchmark_unpermute(num_tokens: int,
-                        num_experts: int,
-                        hidden_size: int,
-                        topk: int,
-                        dtype: torch.dtype,
-                        use_fp8_w8a8: bool,
-                        use_int8_w8a16: bool,
-                        num_iters: int = 100,
-                        use_customized_permute: bool = False) -> float:
-    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
-    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    output_hidden_states = torch.empty_like(hidden_states)
-    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
-        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
-    else:
-        align_block_size = None
-        qhidden_states = hidden_states
-
-    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
-
-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        qhidden_states, input_gating, topk, False)
-
-    def prepare():
-        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx,
-             m_indices) = moe_permute(
-                 qhidden_states,
-                 topk_weights=topk_weights,
-                 topk_ids=topk_ids,
-                 token_expert_indices=token_expert_indices,
-                 topk=topk,
-                 n_expert=num_experts,
-                 n_local_expert=num_experts,
-                 expert_map=None,
-                 align_block_size=align_block_size,
-             )
-            # convert to fp16/bf16 as gemm output
-            return (permuted_hidden_states.to(dtype), first_token_off,
-                    inv_perm_idx, m_indices)
-        else:
-            (permuted_qhidden_states, a1q_scale, sorted_token_ids, expert_ids,
-             inv_perm) = _moe_permute(qhidden_states, None, topk_ids,
-                                      num_experts, None, align_block_size)
-            # convert to fp16/bf16 as gemm output
-            return (permuted_qhidden_states.to(dtype), a1q_scale,
-                    sorted_token_ids, expert_ids, inv_perm)
-
-    def run(input: tuple):
-        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx,
-             m_indices) = input
-            moe_unpermute(permuted_hidden_states, topk_weights, topk_ids,
-                          inv_perm_idx, first_token_off, topk, num_experts,
-                          num_experts)
-        else:
-            (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
-             inv_perm) = input
-            _moe_unpermute_and_reduce(output_hidden_states,
-                                      permuted_hidden_states, inv_perm,
-                                      topk_weights)
-
-    # JIT compilation & warmup
-    input = prepare()
-    run(input)
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run(input)
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies: list[float] = []
-    for i in range(num_iters):
-        torch.cuda.synchronize()
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-@ray.remote(num_gpus=1)
-class BenchmarkWorker:
-
-    def __init__(self, seed: int) -> None:
-        torch.set_default_device("cuda")
-        current_platform.seed_everything(seed)
-        self.seed = seed
-        # Get the device ID to allocate tensors and kernels
-        # on the respective GPU. This is required for Ray to work
-        # correctly with multi-GPU tuning on the ROCm platform.
-        self.device_id = int(ray.get_gpu_ids()[0])
-
-    def benchmark(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8_w8a8: bool,
-        use_int8_w8a16: bool,
-        use_customized_permute: bool = False,
-    ) -> tuple[dict[str, int], float]:
-        current_platform.seed_everything(self.seed)
-
-        permute_time = benchmark_permute(
-            num_tokens,
-            num_experts,
-            hidden_size,
-            topk,
-            dtype,
-            use_fp8_w8a8,
-            use_int8_w8a16,
-            num_iters=100,
-            use_customized_permute=use_customized_permute)
-        unpermute_time = benchmark_unpermute(
-            num_tokens,
-            num_experts,
-            hidden_size,
-            topk,
-            dtype,
-            use_fp8_w8a8,
-            use_int8_w8a16,
-            num_iters=100,
-            use_customized_permute=use_customized_permute)
-        return permute_time, unpermute_time
-
-
-def get_weight_block_size_safety(config, default_value=None):
-
-    quantization_config = getattr(config, 'quantization_config', {})
-    if isinstance(quantization_config, dict):
-        return quantization_config.get('weight_block_size', default_value)
-    return default_value
-
-
-def main(args: argparse.Namespace):
-    print(args)
-
-    config = AutoConfig.from_pretrained(
-        args.model, trust_remote_code=args.trust_remote_code)
-    if config.architectures[0] == "DbrxForCausalLM":
-        E = config.ffn_config.moe_num_experts
-        topk = config.ffn_config.moe_top_k
-    elif config.architectures[0] == "JambaForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-    elif (config.architectures[0] == "DeepseekV3ForCausalLM"
-          or config.architectures[0] == "DeepseekV2ForCausalLM"):
-        E = config.n_routed_experts
-        topk = config.num_experts_per_tok
-    elif config.architectures[0] in [
-            "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"
-    ]:
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-
-    else:
-        # Support for llama4
-        config = config.get_text_config()
-        # Default: Mixtral.
-        E = config.num_local_experts
-        topk = config.num_experts_per_tok
-
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
-    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
-    use_int8_w8a16 = args.dtype == "int8_w8a16"
-    use_customized_permute = args.use_customized_permute
-
-    if args.batch_size is None:
-        batch_sizes = [
-            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
-            2048, 3072, 4096
-        ]
-    else:
-        batch_sizes = [args.batch_size]
-
-    ray.init()
-    num_gpus = int(ray.available_resources()["GPU"])
-    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-
-    def _distribute(method: str, inputs: list[Any]) -> list[Any]:
-        outputs = []
-        worker_idx = 0
-        for input_args in inputs:
-            worker = workers[worker_idx]
-            worker_method = getattr(worker, method)
-            output = worker_method.remote(*input_args)
-            outputs.append(output)
-            worker_idx = (worker_idx + 1) % num_gpus
-        return ray.get(outputs)
-
-    outputs = _distribute(
-        "benchmark", [(batch_size, E, hidden_size, topk, dtype, use_fp8_w8a8,
-                       use_int8_w8a16, use_customized_permute)
-                      for batch_size in batch_sizes])
-
-    for batch_size, (permute, unpermute) in zip(batch_sizes, outputs):
-        print(f"Batch size: {batch_size}")
-        print(f"Permute time: {permute:.2f} us")
-        print(f"Unpermute time: {unpermute:.2f} us")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
-    parser.add_argument("--model",
-                        type=str,
-                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
-                        default="auto")
-    parser.add_argument("--use-customized-permute", action="store_true")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--batch-size", type=int, required=False)
-    parser.add_argument("--trust-remote-code", action="store_true")
-    args = parser.parse_args()
-
-    main(args)
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -7,13 +7,10 @@ from typing import Optional
 import torch

 from vllm import _custom_ops as ops
-from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
                        create_kv_caches_with_random)

-logger = init_logger(__name__)
-
 NUM_BLOCKS = 128 * 1024
 PARTITION_SIZE = 512
 PARTITION_SIZE_ROCM = 256
@ -196,9 +193,6 @@ def main(


 if __name__ == '__main__':
-    logger.warning("This script benchmarks the paged attention kernel. "
-                   "By default this is no longer used in vLLM inference.")
-
    parser = FlexibleArgumentParser(
        description="Benchmark the paged attention kernel.")
    parser.add_argument("--version",
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@ -4,11 +4,11 @@ import itertools
 from typing import Optional, Union

 import torch
+import triton
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
 from torch import nn

 from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton


 class HuggingFaceRMSNorm(nn.Module):
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@ -75,19 +75,3 @@ WEIGHT_SHAPES = {
        [7168, 8192],
    ],
 }
-
-WEIGHT_SHAPES_MOE = {
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
-        [8, 2, 4096, 28672],
-        [8, 2, 14336, 4096],
-    ],
-    "nm-testing/deepseekv2-lite": [
-        [64, 6, 2048, 1408],
-    ],
-    "ibm-granite/granite-3.0-1b-a400m": [
-        [32, 8, 1024, 1024],
-    ],
-    "ibm-granite/granite-3.0-3b-a800m": [
-        [40, 8, 1024, 1536],
-    ],
-}
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -1,420 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Adapted from sglang quantization/tuning_block_wise_kernel.py
-
-import argparse
-import json
-import multiprocessing as mp
-import os
-import time
-from datetime import datetime
-from typing import Any
-
-import torch
-import tqdm
-import triton
-
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_block_fp8_matmul)
-from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
-
-mp.set_start_method("spawn", force=True)
-
-assert current_platform.is_cuda(
-), "Only support tune w8a8 block fp8 kernel on CUDA device."
-
-DTYPE_MAP = {
-    "float32": torch.float32,
-    "float16": torch.float16,
-    "half": torch.half,
-    "bfloat16": torch.bfloat16,
-}
-
-
-def w8a8_block_matmul(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    As: torch.Tensor,
-    Bs: torch.Tensor,
-    block_size: list[int],
-    config: dict[str, Any],
-    output_dtype: torch.dtype = torch.float16,
-) -> torch.Tensor:
-    """This function performs matrix multiplication with 
-    block-wise quantization.
-
-    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
-    The output is returned in the specified `output_dtype`.
-
-    Args:
-        A: The input tensor, e.g., activation.
-        B: The input tensor, e.g., weight.
-        As: The per-token-group quantization scale for `A`.
-        Bs: The per-block quantization scale for `B`.
-        block_size: The block size for per-block quantization. 
-                    It should be 2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
-
-    Returns:
-        torch.Tensor: The result of matmul.
-    """
-    assert len(block_size) == 2
-    block_n, block_k = block_size[0], block_size[1]
-
-    assert A.shape[-1] == B.shape[-1]
-    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
-    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
-    M = A.numel() // A.shape[-1]
-
-    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
-    N, K = B.shape
-    assert triton.cdiv(N, block_n) == Bs.shape[0]
-    assert triton.cdiv(K, block_k) == Bs.shape[1]
-
-    C_shape = A.shape[:-1] + (N, )
-    C = A.new_empty(C_shape, dtype=output_dtype)
-
-    def grid(META):
-        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
-                triton.cdiv(N, META["BLOCK_SIZE_N"]), )
-
-    if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_block_fp8_matmul
-    else:
-        raise RuntimeError(
-            "Currently, only support tune w8a8 block fp8 kernel.")
-
-    kernel[grid](
-        A,
-        B,
-        C,
-        As,
-        Bs,
-        M,
-        N,
-        K,
-        block_n,
-        block_k,
-        A.stride(-2),
-        A.stride(-1),
-        B.stride(1),
-        B.stride(0),
-        C.stride(-2),
-        C.stride(-1),
-        As.stride(-2),
-        As.stride(-1),
-        Bs.stride(1),
-        Bs.stride(0),
-        **config,
-    )
-
-    return C
-
-
-def get_configs_compute_bound():
-    configs = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128]:
-                for block_n in [32, 64, 128, 256]:
-                    for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_m,
-                                "BLOCK_SIZE_N": block_n,
-                                "BLOCK_SIZE_K": block_k,
-                                "GROUP_SIZE_M": group_size,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
-    return configs
-
-
-def get_weight_shapes(tp_size):
-    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3.
-    # Modify them, if you tune for another different model.
-    # cannot TP
-    total = [
-        (512 + 64, 7168),
-        ((128 + 64) * 128, 7168),
-        (128 * (128 + 128), 512),
-        (7168, 16384),
-        (7168, 18432),
-    ]
-    # N can TP
-    n_tp = [
-        (18432 * 2, 7168),
-        ((128 + 64) * 128, 7168),
-        (128 * (128 + 128), 512),
-        (24576, 1536),
-        (12288, 7168),
-        (4096, 7168),
-    ]
-    # K can TP
-    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
-
-    weight_shapes = []
-    for t in total:
-        weight_shapes.append(t)
-    for n_t in n_tp:
-        new_t = (n_t[0] // tp_size, n_t[1])
-        weight_shapes.append(new_t)
-    for k_t in k_tp:
-        new_t = (k_t[0], k_t[1] // tp_size)
-        weight_shapes.append(new_t)
-    return weight_shapes
-
-
-def benchmark_config(A,
-                     B,
-                     As,
-                     Bs,
-                     block_size,
-                     config,
-                     out_dtype=torch.float16,
-                     num_iters=10):
-
-    def run():
-        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
-
-    torch.cuda.synchronize()
-    # JIT complication & warmup
-    for _ in range(5):
-        run()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies: list[float] = []
-    for i in range(num_iters):
-        torch.cuda.synchronize()
-        start_event.record()
-        run()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    return avg
-
-
-def tune(M, N, K, block_size, out_dtype, search_space, input_type):
-    factor_for_scale = 1e-2
-
-    if input_type == "fp8":
-        fp8_info = torch.finfo(torch.float8_e4m3fn)
-        fp8_max, fp8_min = fp8_info.max, fp8_info.min
-
-        A_fp32 = (
-            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
-            fp8_max)
-        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-
-        B_fp32 = (
-            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
-            fp8_max)
-        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-    else:
-        raise RuntimeError(
-            "Currently, only support tune w8a8 block fp8 kernel.")
-
-    block_n, block_k = block_size[0], block_size[1]
-    n_tiles = (N + block_n - 1) // block_n
-    k_tiles = (K + block_k - 1) // block_k
-
-    As = torch.rand(M, k_tiles, dtype=torch.float32,
-                    device="cuda") * factor_for_scale
-    Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
-          factor_for_scale)
-
-    best_config = None
-    best_time = float("inf")
-    for config in tqdm(search_space):
-        try:
-            kernel_time = benchmark_config(
-                A,
-                B,
-                As,
-                Bs,
-                block_size,
-                config,
-                out_dtype,
-                num_iters=10,
-            )
-        except triton.runtime.autotuner.OutOfResources:
-            # Some configurations may be invalid and fail to compile.
-            continue
-
-        if kernel_time < best_time:
-            best_time = kernel_time
-            best_config = config
-    now = datetime.now()
-    print(f"{now.ctime()}] Completed tuning for batch_size={M}")
-    assert best_config is not None
-    return best_config
-
-
-def save_configs(
-    N,
-    K,
-    block_n,
-    block_k,
-    configs,
-    save_path,
-    input_type="fp8",
-) -> None:
-    os.makedirs(save_path, exist_ok=True)
-    device_name = current_platform.get_device_name().replace(" ", "_")
-    json_file_name = (
-        f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
-        f"block_shape=[{block_n},{block_k}].json")
-
-    config_file_path = os.path.join(save_path, json_file_name)
-    print(f"Writing best config to {config_file_path}...")
-
-    with open(config_file_path, "w") as f:
-        json.dump(configs, f, indent=4)
-        f.write("\n")
-
-
-def tune_on_gpu(args_dict):
-    """Run tuning on a specific GPU."""
-    gpu_id = args_dict["gpu_id"]
-    batch_sizes = args_dict["batch_sizes"]
-    weight_shapes = args_dict["weight_shapes"]
-    args = args_dict["args"]
-
-    torch.cuda.set_device(gpu_id)
-    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
-
-    block_n = args.block_n
-    block_k = args.block_k
-    out_dtype = DTYPE_MAP[args.out_dtype]
-    save_path = args.save_path
-    input_type = args.input_type
-
-    search_space = get_configs_compute_bound()
-    search_space = [
-        config for config in search_space
-        if block_k % config["BLOCK_SIZE_K"] == 0
-    ]
-
-    start = time.time()
-    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
-        N, K = shape[0], shape[1]
-        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
-        benchmark_results = [
-            tune(
-                batch_size,
-                N,
-                K,
-                [block_n, block_k],
-                out_dtype,
-                search_space,
-                input_type,
-            ) for batch_size in tqdm(batch_sizes,
-                                     desc=f"GPU {gpu_id} - Batch sizes")
-        ]
-        best_configs = {
-            M: config
-            for M, config in zip(batch_sizes, benchmark_results)
-        }
-        save_configs(N, K, block_n, block_k, best_configs, save_path,
-                     input_type)
-
-    end = time.time()
-    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
-
-
-def distribute_batch_sizes(batch_sizes, num_gpus):
-    """Distribute batch sizes across available GPUs."""
-    batches_per_gpu = []
-    for i in range(num_gpus):
-        start_idx = i * len(batch_sizes) // num_gpus
-        end_idx = (i + 1) * len(batch_sizes) // num_gpus
-        batches_per_gpu.append(batch_sizes[start_idx:end_idx])
-    return batches_per_gpu
-
-
-def main(args):
-    print(args)
-    num_gpus = torch.cuda.device_count()
-    if num_gpus == 0:
-        raise RuntimeError("No GPU available for tuning")
-    print(f"Found {num_gpus} GPUs for parallel tuning")
-
-    torch.cuda.init()
-
-    if args.batch_size is None:
-        batch_sizes = [
-            1,
-            2,
-            4,
-            8,
-            16,
-            24,
-            32,
-            48,
-            64,
-            96,
-            128,
-            256,
-            512,
-            1024,
-            1536,
-            2048,
-            3072,
-            4096,
-        ]
-    else:
-        batch_sizes = [args.batch_size]
-        num_gpus = 1  # If only one batch size, use only one GPU
-
-    weight_shapes = get_weight_shapes(args.tp_size)
-
-    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
-
-    process_args = []
-    for gpu_id in range(num_gpus):
-        process_args.append({
-            "gpu_id": gpu_id,
-            "batch_sizes": batches_per_gpu[gpu_id],
-            "weight_shapes":
-            weight_shapes,  # Each GPU processes all weight shapes
-            "args": args,
-        })
-
-    ctx = mp.get_context("spawn")
-    with ctx.Pool(num_gpus) as pool:
-        pool.map(tune_on_gpu, process_args)
-
-    print("Multi-GPU tuning completed")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="""
-Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
-    python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
-Then copy to model_executor/layers/quantization/utils/configs
-        """,
-        formatter_class=argparse.RawTextHelpFormatter)
-
-    parser.add_argument("--tp-size", "-tp", type=int, default=8)
-    parser.add_argument("--input-type",
-                        type=str,
-                        choices=["fp8"],
-                        default="fp8")
-    parser.add_argument(
-        "--out-dtype",
-        type=str,
-        choices=["float32", "float16", "bfloat16", "half"],
-        default="float16",
-    )
-    parser.add_argument("--block-n", type=int, default=128)
-    parser.add_argument("--block-k", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, required=False)
-    parser.add_argument("--save-path", type=str, default="./")
-    args = parser.parse_args()
-
-    main(args)
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -6,13 +6,13 @@ import time
 # Import DeepGEMM functions
 import deep_gemm
 import torch
+import triton
 from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor

 # Import vLLM functions
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
-from vllm.triton_utils import triton


 # Copied from
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
+           -v "$PWD/data:/data" \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
+           --model-id "$MODEL" \
+           --sharded false  \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens "$TOKENS"
--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@ -9,10 +9,13 @@ BACKEND=${2:-"vllm"}
 # Define the dataset to use
 DATASET=${3:-"xgrammar_bench"}

-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-OUTPUT_DIR=${4:-"$SCRIPT_DIR/structured_output_benchmark_results"}
+# Define the guided decoding backend
+GUIDED_BACKEND=${4:-"xgrammar"}

-GUIDED_RATIO=${5:-0.5}
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"}
+
+GUIDED_RATIO=${6:-0.5}

 # Create output directory if it doesn't exist
 mkdir -p "$OUTPUT_DIR"
@ -24,6 +27,7 @@ QPS_VALUES=(70 60 50 25 20 15 10)
 COMMON_PARAMS="--backend $BACKEND \
               --model $MODEL \
               --dataset $DATASET \
+               --structured-output-backend $GUIDED_BACKEND \
               --structured-output-ratio $GUIDED_RATIO \
               --save-results \
               --result-dir $OUTPUT_DIR"
@ -31,6 +35,7 @@ COMMON_PARAMS="--backend $BACKEND \
 echo "Starting structured output benchmark with model: $MODEL"
 echo "Backend: $BACKEND"
 echo "Dataset: $DATASET"
+echo "Structured output backend: $GUIDED_BACKEND"
 echo "Results will be saved to: $OUTPUT_DIR"
 echo "----------------------------------------"

@ -43,13 +48,12 @@ for qps in "${QPS_VALUES[@]}"; do
  GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")

  # Construct filename for this run
-  FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
+  FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"

  # Run the benchmark
  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
    --request-rate $qps \
    --result-filename "$FILENAME" \
-    --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
    --port ${PORT:-8000}

  echo "Completed benchmark with QPS: $qps"
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -33,6 +33,8 @@ endif()

 if(MACOSX_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
+        "-Xpreprocessor"
+        "-fopenmp"
        "-DVLLM_CPU_EXTENSION")
 else()
    list(APPEND CXX_COMPILE_FLAGS
@ -167,33 +169,6 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)

    FetchContent_MakeAvailable(oneDNN)
    
-    list(APPEND LIBS dnnl)
-elseif(POWER10_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.7.2
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
-
-    set(ONEDNN_LIBRARY_TYPE "STATIC")
-    set(ONEDNN_BUILD_DOC "OFF")
-    set(ONEDNN_BUILD_EXAMPLES "OFF")
-    set(ONEDNN_BUILD_TESTS "OFF")
-    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
-    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
-    set(ONEDNN_BUILD_GRAPH "OFF")
-    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
-    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
-    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
-    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-
-    set(DNNL_CPU_RUNTIME "OMP")
-
-    FetchContent_MakeAvailable(oneDNN)
-
    list(APPEND LIBS dnnl)
 endif()

@ -215,16 +190,10 @@ set(VLLM_EXT_SRC
    "csrc/cpu/cache.cpp"
    "csrc/cpu/utils.cpp"
    "csrc/cpu/layernorm.cpp"
-    "csrc/cpu/mla_decode.cpp"
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
-    set(VLLM_EXT_SRC
-        "csrc/cpu/quant.cpp"
-        "csrc/cpu/shm.cpp"
-        ${VLLM_EXT_SRC})
-elseif(POWER10_FOUND)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
        ${VLLM_EXT_SRC})
@ -245,4 +214,4 @@ define_gpu_extension_target(
    WITH_SOABI
 )

-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
+          GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9 
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@ -105,14 +105,8 @@ def run(command):
    else:
        enc = locale.getpreferredencoding()
    output = raw_output.decode(enc)
-    if command == 'nvidia-smi topo -m':
-        # don't remove the leading whitespace of `nvidia-smi topo -m`
-        #   because they are meaningful
-        output = output.rstrip()
-    else:
-        output = output.strip()
    err = raw_err.decode(enc)
-    return rc, output, err.strip()
+    return rc, output.strip(), err.strip()


 def run_and_read_all(run_lambda, command):
@ -282,20 +276,12 @@ def get_vllm_version():

    if __version__ == "dev":
        return "N/A (dev)"
-    version_str = __version_tuple__[-1]
-    if isinstance(version_str, str) and version_str.startswith('g'):
-        # it's a dev build
-        if '.' in version_str:
-            # it's a dev build containing local changes
-            git_sha = version_str.split('.')[0][1:]
-            date = version_str.split('.')[-1][1:]
-            return f"{__version__} (git sha: {git_sha}, date: {date})"
-        else:
-            # it's a dev build without local changes
-            git_sha = version_str[1:]  # type: ignore
-            return f"{__version__} (git sha: {git_sha})"
-    return __version__

+    if len(__version_tuple__) == 4: # dev build
+        git_sha = __version_tuple__[-1][1:] # type: ignore
+        return f"{__version__} (git sha: {git_sha})"
+
+    return __version__

 def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
@ -496,30 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS

-    def run_with_pip():
-        try:
-            import importlib.util
-            pip_spec = importlib.util.find_spec('pip')
-            pip_available = pip_spec is not None
-        except ImportError:
-            pip_available = False
-
-        if pip_available:
-            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
-        elif os.environ.get("UV") is not None:
-            print("uv is set")
-            cmd = ["uv", "pip", "list", "--format=freeze"]
-        else:
-            raise RuntimeError(
-                "Could not collect pip list output (pip or uv module not available)"
-            )
-
-        out = run_and_read_all(run_lambda, cmd)
+    # People generally have `pip` as `pip` or `pip3`
+    # But here it is invoked as `python -mpip`
+    def run_with_pip(pip):
+        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
        return "\n".join(line for line in out.splitlines()
                         if any(name in line for name in patterns))

    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
-    out = run_with_pip()
+    out = run_with_pip([sys.executable, '-mpip'])
+
    return pip_version, out


@ -545,12 +517,13 @@ def is_xnnpack_available():
    else:
        return "N/A"

-
 def get_env_vars():
    env_vars = ''
-    secret_terms = ('secret', 'token', 'api', 'access', 'password')
-    report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
-                     "OMP_", "MKL_", "NVIDIA")
+    secret_terms=('secret', 'token', 'api', 'access', 'password')
+    report_prefix = ("TORCH", "NCCL", "PYTORCH",
+                     "CUDA", "CUBLAS", "CUDNN",
+                     "OMP_", "MKL_",
+                     "NVIDIA")
    for k, v in os.environ.items():
        if any(term in k.lower() for term in secret_terms):
            continue
@ -561,7 +534,6 @@ def get_env_vars():

    return env_vars

-
 def get_env_info():
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Tyler Michael Smith	031c8b32a4	Add time comment Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-03-17 13:50:44 +00:00
Tyler Michael Smith	ac08d45200	Merge branch 'main' into mamba_tests	2025-03-17 13:49:56 +00:00
Tyler Michael Smith	a5d29e9ee1	undo massive formatting change Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-03-15 17:31:21 +00:00
Tyler Michael Smith	696245c2fc	Add SSM and Hybrid Models Test Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-03-15 17:26:01 +00:00