Reduce Scatter Plumbing

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-28 16:33:52 +00:00
2093 changed files with 67528 additions and 211906 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -8,12 +8,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
@ -28,18 +28,14 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                        f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
-                        f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                        f"({VLLM_MAX_SIZE_MB} MB)."
                    )
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
-                    print(
+                    print(f"Wheel {wheel_path} is within the allowed size "
-                        f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb:.2f} MB).")
                        f"({wheel_size_mb:.2f} MB)."
                    )
    return 0
@ -49,4 +45,4 @@ if __name__ == "__main__":
        sys.exit(1)
    directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    sys.exit(check_wheel_size(directory))
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -22,5 +22,5 @@ with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # cloudfront requires escaping the '+' character
    f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+        template.format(wheel=filename,
-    )
+                        wheel_html_escaped=filename.replace("+", "%2B")))
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@ -1,4 +1,3 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
@ -1,11 +0,0 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -1,12 +1,11 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.231
+    value: 0.233
  - name: "exact_match,flexible-extract"
-    value: 0.22
+    value: 0.236
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@ -1,12 +0,0 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
 model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
  - name: "exact_match,flexible-extract"
    value: 0.465
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
@ -1,11 +0,0 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
 model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,11 +0,0 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@ -1,4 +1,3 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@ -1,43 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 from pathlib import Path
 import pytest
 def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )
@pytest.fixture(scope="session")
 def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path
@pytest.fixture(scope="session")
 def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")
 def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -0,0 +1,59 @@
 #!/bin/bash
 usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size"
    echo
 }
 SUCCESS=0
 while getopts "c:t:" OPT; do
  case ${OPT} in
    c ) 
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
 done
 # Parse list of configs.
 IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do
    LOCAL_SUCCESS=0
    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi
    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
 done
 if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
 else
    exit 1
 fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -3,52 +3,62 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
-pytest -s -v test_lm_eval_correctness.py \
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-    --config-list-file=configs/models-small.txt \
+* export LM_EVAL_TP_SIZE=4 
-    --tp-size=1
+* pytest -s test_lm_eval_correctness.py
 """
 import os
 from pathlib import Path
 import lm_eval
-import numpy as np
+import numpy
 import yaml
-RTOL = 0.08
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
 TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
-def launch_lm_eval(eval_config, tp_size):
+def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get("trust_remote_code", False)
+    trust_remote_code = eval_config.get('trust_remote_code', False)
-    model_args = (
+
-        f"pretrained={eval_config['model_name']},"
+    model_args = f"pretrained={eval_config['model_name']}," \
-        f"tensor_parallel_size={tp_size},"
+                 f"tensor_parallel_size={TP_SIZE}," \
-        f"enforce_eager=true,"
+                 f"add_bos_token=true," \
-        f"add_bos_token=true,"
+                 f"trust_remote_code={trust_remote_code}"
-        f"trust_remote_code={trust_remote_code}"
+
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto",
+        batch_size="auto")
-    )
+
    return results
-def test_lm_eval_correctness_param(config_filename, tp_size):
+def test_lm_eval_correctness():
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-    results = launch_lm_eval(eval_config, tp_size)
+    # Launch eval requests.
    results = launch_lm_eval(eval_config)
    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
-            print(
+            print(f'{task["name"]} | {metric["name"]}: '
-                f"{task['name']} | {metric['name']}: "
+                  f'ground_truth={ground_truth} | measured={measured_value}')
-                f"ground_truth={ground_truth} | measured={measured_value}"
+            success = success and numpy.isclose(
-            )
+                ground_truth, measured_value, rtol=RTOL)
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
    # Assert at the end, print all scores even on failure for debugging.
    assert success
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -65,18 +65,18 @@ def read_markdown(file):
 def results_to_json(latency, throughput, serving):
-    return json.dumps(
+    return json.dumps({
-        {
+        'latency': latency.to_dict(),
-            "latency": latency.to_dict(),
+        'throughput': throughput.to_dict(),
-            "throughput": throughput.to_dict(),
+        'serving': serving.to_dict()
-            "serving": serving.to_dict(),
+    })
        }
    )
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
@ -120,8 +120,7 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
            # add the result to raw_result
@ -154,27 +153,26 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)
-    raw_results_json = results_to_json(
+    raw_results_json = results_to_json(latency_results, throughput_results,
-        latency_results, throughput_results, serving_results
+                                       serving_results)
    )
    # remapping the key, for visualization purpose
    if not latency_results.empty:
-        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+        latency_results = latency_results[list(
-            columns=latency_column_mapping
+            latency_column_mapping.keys())].rename(
-        )
+                columns=latency_column_mapping)
    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+        serving_results = serving_results[list(
-            columns=serving_column_mapping
+            serving_column_mapping.keys())].rename(
-        )
+                columns=serving_column_mapping)
    if not throughput_results.empty:
-        throughput_results = throughput_results[
+        throughput_results = throughput_results[list(
-            list(throughput_results_column_mapping.keys())
+            throughput_results_column_mapping.keys())].rename(
-        ].rename(columns=throughput_results_column_mapping)
+                columns=throughput_results_column_mapping)
-    processed_results_json = results_to_json(
+    processed_results_json = results_to_json(latency_results,
-        latency_results, throughput_results, serving_results
+                                             throughput_results,
-    )
+                                             serving_results)
    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@ -186,39 +184,38 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
        )
    # get markdown tables
-    latency_md_table = tabulate(
+    latency_md_table = tabulate(latency_results,
-        latency_results, headers="keys", tablefmt="pipe", showindex=False
+                                headers='keys',
-    )
+                                tablefmt='pipe',
-    serving_md_table = tabulate(
+                                showindex=False)
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    serving_md_table = tabulate(serving_results,
-    )
+                                headers='keys',
-    throughput_md_table = tabulate(
+                                tablefmt='pipe',
-        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+                                showindex=False)
-    )
+    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)
    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:
-        results = read_markdown(
+
-            "../.buildkite/nightly-benchmarks/"
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-            + "performance-benchmarks-descriptions.md"
+                                "performance-benchmarks-descriptions.md")
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json,
+            benchmarking_results_in_json_string=processed_results_json)
        )
        f.write(results)
    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:
-        results = (
+
-            latency_results.to_dict(orient="records")
+        results = latency_results.to_dict(
-            + throughput_results.to_dict(orient="records")
+            orient='records') + throughput_results.to_dict(
-            + serving_results.to_dict(orient="records")
+                orient='records') + serving_results.to_dict(orient='records')
        )
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -14,12 +14,15 @@ def main(model, cachedir):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
+        description="Download and save Hugging Face tokenizer")
-    )
+    parser.add_argument("--model",
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+                        type=str,
-    parser.add_argument(
+                        required=True,
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+                        help="Name of the model")
-    )
+    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")
    args = parser.parse_args()
    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -11,33 +11,33 @@ from tabulate import tabulate
 def parse_arguments():
    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
+        description=
-    )
+        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument(
+    parser.add_argument('--results-folder',
-        "--results-folder",
+                        type=str,
-        type=str,
+                        required=True,
-        required=True,
+                        help='The folder where the results are stored.')
-        help="The folder where the results are stored.",
+    parser.add_argument('--description',
-    )
+                        type=str,
-    parser.add_argument(
+                        required=True,
-        "--description", type=str, required=True, help="Description of the results."
+                        help='Description of the results.')
    )
    args = parser.parse_args()
    return args
 def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
+        target = df['Test name'].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
+        target = target & df['Engine'].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
        filtered_df = df[target]
        if filtered_df.empty:
-            means.append(0.0)
+            means.append(0.)
        else:
            means.append(filtered_df[metric].values[0])
@ -45,6 +45,7 @@ def get_perf(df, method, model, metric):
 def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
@ -59,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
+            df, method, model, "Output Tput (tok/s)")
        )
        mean = mean.tolist()
        std = None
@ -80,17 +80,18 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
    with open(args.description) as f:
        description = f.read()
-    description = description.format(nightly_results_benchmarking_table=md_table)
+    description = description.format(
        nightly_results_benchmarking_table=md_table)
    with open("nightly_results.md", "w") as f:
        f.write(description)
-if __name__ == "__main__":
+if __name__ == '__main__':
    args = parse_arguments()
    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -426,7 +426,7 @@ main() {
  pip install -U transformers
-  pip install -r requirements/dev.txt
+  pip install -r requirements-dev.txt
  which genai-perf
  # check storage
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -10,24 +10,15 @@ set -x
 set -o pipefail
 check_gpus() {
-  if command -v nvidia-smi; then
+  # check the number of GPUs and GPU type.
-    # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  if command -v nvidia-smi; then
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
 }
@ -99,15 +90,9 @@ kill_gpu_processes() {
  # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
-      sleep 1
+  done
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  fi
  # remove vllm config file
  rm -rf ~/.config/vllm
@ -376,7 +361,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOGGING_LEVEL="WARNING"
+  export VLLM_LOG_LEVEL="WARNING"
  # prepare for benchmarking
  cd benchmarks || exit 1
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -34,8 +34,10 @@ serving_column_mapping = {
 }
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
@ -54,16 +56,17 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+        serving_results = serving_results[list(
-            columns=serving_column_mapping
+            serving_column_mapping.keys())].rename(
-        )
+                columns=serving_column_mapping)
-    serving_md_table_with_headers = tabulate(
+    serving_md_table_with_headers = tabulate(serving_results,
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
+                                             headers='keys',
-    )
+                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@ -73,9 +76,10 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
-        f.write("\n")
+        f.write('\n')
    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
+
        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -63,12 +63,10 @@
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
+            "swap_space": 16, 
-            "speculative_config": {
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
-                "model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
-                "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1
                "draft_tensor_parallel_size": 1
            }
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@ -1,46 +0,0 @@
 # This local pyproject file is part of the migration from yapf to ruff format.
 # It uses the same core rules as the main pyproject.toml file, but with the
 # following differences:
 # - ruff line length is overridden to 88
 # - deprecated typing ignores (UP006, UP035) have been removed
 [tool.ruff]
 line-length = 88
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
 [tool.ruff.lint]
 select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
 ]
 ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
 ]
 [tool.ruff.format]
 docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,23 +1,23 @@
 steps:
-  - label: "Build wheel - CUDA 12.8"
+  - label: "Build wheel - CUDA 12.4"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
-  - label: "Build wheel - CUDA 12.6"
+  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
@ -31,10 +31,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
@ -48,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Build and publish TPU release image"
@ -57,14 +57,12 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "yes | docker system prune -a"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
-          username: vllmbot
+          username: vllm
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
@ -84,22 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
  - block: "Build Neuron release image"
    key: block-neuron-release-image-build
    depends_on: ~
  - label: "Build and publish Neuron release image"
    depends_on: block-neuron-release-image-build
    agents:
      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -0,0 +1,162 @@
 #!/bin/bash
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- ROCm info"
 rocminfo
 # cleanup older docker images
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
 }
 # Call the cleanup docker function
 cleanup_docker
 echo "--- Resetting GPUs"
 echo "reset" > /opt/amdgpu/etc/gpu_state
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- Pulling container" 
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
 remove_docker_container() {
   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT
 echo "--- Running container"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 #ignore certain kernels tests
 if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
  --ignore=kernels/test_attention.py \
  --ignore=kernels/test_attention_selector.py \
  --ignore=kernels/test_blocksparse_attention.py \
  --ignore=kernels/test_causal_conv1d.py \
  --ignore=kernels/test_cutlass.py \
  --ignore=kernels/test_encoder_decoder_attn.py \
  --ignore=kernels/test_flash_attn.py \
  --ignore=kernels/test_flashinfer.py \
  --ignore=kernels/test_int8_quant.py \
  --ignore=kernels/test_machete_gemm.py \
  --ignore=kernels/test_mamba_ssm.py \
  --ignore=kernels/test_marlin_gemm.py \
  --ignore=kernels/test_moe.py \
  --ignore=kernels/test_prefix_prefill.py \
  --ignore=kernels/test_rand.py \
  --ignore=kernels/test_sampler.py \
  --ignore=kernels/test_cascade_flash_attn.py \
  --ignore=kernels/test_mamba_mixer2.py"
 fi
 #ignore certain Entrypoints tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_encoder_decoder.py \
  --ignore=entrypoints/openai/test_embedding.py \
  --ignore=entrypoints/openai/test_oot_registration.py "}
 fi
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used   
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
        -e AWS_ACCESS_KEY_ID \
        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        --name "${container_name}_${GPU}" \
        "${image_name}" \
        /bin/bash -c "${commands_gpu}" \
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
 else
  docker run \
          --device /dev/kfd --device /dev/dri \
          --network host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
          -e AWS_ACCESS_KEY_ID \
          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
          --name "${container_name}" \
          "${image_name}" \
          /bin/bash -c "${commands}"
 fi
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -5,8 +5,8 @@
 set -ex
 set -o pipefail
-# cd 2 levels into the working directory
+# cd into parent directory of this file
-cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.s390x .
+docker build -t cpu-test -f Dockerfile.ppc64le .
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -8,29 +8,24 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 # Setup cleanup
-remove_docker_container() { 
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
    set -e; 
    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
  export BUILDKITE_BUILD_NUMBER=$3
  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@ -40,8 +35,7 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pip install -r vllm/requirements-test.txt
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@ -91,4 +85,4 @@ function cpu_tests() {
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@ -9,13 +9,11 @@ python3 use_existing_torch.py
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and test offline inference
-docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -5,22 +5,20 @@
 set -ex
 # Try building the docker image
-docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+docker build -t hpu-test-env -f Dockerfile.hpu .
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
 # override the exit code of the script, so we need to use
-# separate remove_docker_containers and remove_docker_containers_and_exit
+# separate remove_docker_container and remove_docker_container_and_exit
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
+remove_docker_container() { docker rm -f hpu-test || true; }
-remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
-trap remove_docker_containers_and_exit EXIT
+trap remove_docker_container_and_exit EXIT
-remove_docker_containers
+remove_docker_container
 # Run the image and launch offline inference
 docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
 EXITCODE=$?
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@ -3,7 +3,7 @@
 set -euox pipefail
 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 # Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
@ -36,7 +35,7 @@ else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 # Setup cleanup
 remove_docker_container() {
@ -45,19 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT
 # Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -e "HF_TOKEN=${HF_TOKEN}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-       /bin/bash -c "
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
            python3 /workspace/vllm/examples/offline_inference/neuron.py;
            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
                echo 'Running test file: '$f;
                python3 -m pytest \$f -v --capture=tee-sys;
            done
       "
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -0,0 +1,16 @@
 #!/bin/bash
 # This script build the OpenVINO docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t openvino-test -f Dockerfile.openvino .
 # Setup cleanup
 remove_docker_container() { docker rm -f openvino-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
 docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -0,0 +1,26 @@
 #!/bin/bash
 set -e
 # Build the docker image.
 docker build -f Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 remove_docker_container() { docker rm -f tpu-test || true; }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -0,0 +1,19 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t xpu-test -f Dockerfile.xpu .
 # Setup cleanup
 remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -1,241 +0,0 @@
 #!/bin/bash
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail
 # Export Python path
 export PYTHONPATH=".."
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- ROCm info"
 rocminfo
 # cleanup older docker images
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
 }
 # Call the cleanup docker function
 cleanup_docker
 echo "--- Resetting GPUs"
 echo "reset" > /opt/amdgpu/etc/gpu_state
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- Pulling container" 
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
 remove_docker_container() {
   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT
 echo "--- Running container"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 fi
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
 fi
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
  --ignore=kernels/core/test_fused_quant_layernorm.py \
  --ignore=kernels/core/test_permute_cols.py"
 fi
 if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
  --ignore=kernels/attention/stest_attention_selector.py \
  --ignore=kernels/attention/test_blocksparse_attention.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
  --ignore=kernels/attention/test_cascade_flash_attn.py \
  --ignore=kernels/attention/test_mha_attn.py \
  --ignore=kernels/attention/test_lightning_attn.py \
  --ignore=kernels/attention/test_attention.py"
 fi
 if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
  --ignore=kernels/quantization/test_aqlm.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \
  --ignore=kernels/quantization/test_marlin_gemm.py \
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
  --ignore=kernels/quantization/test_int8_kernel.py"
 fi
 if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
  --ignore=kernels/mamba/test_mamba_mixer2.py \
  --ignore=kernels/mamba/test_causal_conv1d.py \
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 fi
 if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
  --ignore=kernels/moe/test_moe.py \
  --ignore=kernels/moe/test_cutlass_moe.py \
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 fi
 #ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi
 #ignore certain Entrypoints/llm tests
 if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 #Obsolete currently
 ##ignore certain Entrypoints/llm tests
 #if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 #  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 #fi
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
 # --ignore=entrypoints/openai/test_accuracy.py \
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used   
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
        --network=host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
        -e AWS_ACCESS_KEY_ID \
        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        -e "PYTHONPATH=${MYPYTHONPATH}" \
        --name "${container_name}_${GPU}" \
        "${image_name}" \
        /bin/bash -c "${commands_gpu}" \
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
 else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
          --network=host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
          -e AWS_ACCESS_KEY_ID \
          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
          -e "PYTHONPATH=${MYPYTHONPATH}" \
          --name "${container_name}" \
          "${image_name}" \
          /bin/bash -c "${commands}"
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -1,48 +0,0 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Setup cleanup
 remove_docker_container() {
  if [[ -n "$container_id" ]]; then
      podman rm -f "$container_id" || true
  fi
  podman system prune -f
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
 # Run the image
 container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
 function cpu_tests() {
  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run basic model test
  podman exec -it "$container_id" bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export container_id
 export -f cpu_tests
 timeout 40m bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -1,103 +0,0 @@
 #!/bin/bash
 set -xu
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 remove_docker_container() { docker rm -f tpu-test || true; }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest pytest-asyncio tpu-info \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_XLA_CACHE_PATH= \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \
    && tpu-info \
    && { \
        echo TEST_0: Running test_perf.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
        echo TEST_0_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_1: Running test_compilation.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
        echo TEST_1_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_2: Running test_basic.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
        echo TEST_2_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
        echo TEST_3_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_4: Running test_quantization_accuracy.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
        echo TEST_4_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_5: Running examples/offline_inference/tpu.py; \
        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
        echo TEST_5_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_6: Running test_tpu_model_runner.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
        echo TEST_6_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_7: Running test_sampler.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
        echo TEST_7_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_8: Running test_topk_topp_sampler.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
        echo TEST_8_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_9: Running test_multimodal.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
        echo TEST_9_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_10: Running test_pallas.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
        echo TEST_10_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_11: Running test_struct_output_generate.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
        echo TEST_11_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_12: Running test_moe_pallas.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
        echo TEST_12_EXIT_CODE: \$?; \
    } & \
    # Disable the TPU LoRA tests until the feature is activated
    # & { \
    #     echo TEST_13: Running test_moe_pallas.py; \
    #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
    #     echo TEST_13_EXIT_CODE: \$?; \
    # } & \
    wait \
    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
 "
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -1,31 +0,0 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 # Try building the docker image
 docker build -t ${image_name} -f docker/Dockerfile.xpu .
 # Setup cleanup
 remove_docker_container() { 
  docker rm -f "${container_name}" || true; 
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
 }
 trap remove_docker_container EXIT
 # Run the image and test offline inference/tensor parallel
 docker run \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -8,7 +8,6 @@
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
@ -32,17 +31,17 @@ steps:
 ##### fast check tests  #####
 - label: Documentation Build # 2min
-  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/test_docs/docs"
  working_dir: "/vllm-workspace/test_docs"
  fast_check: true
  no_gpu: True
  commands:
-  - pip install -r ../requirements/docs.txt
+  - pip install -r requirements-docs.txt
-  # TODO: add `--strict` once warnings in docstrings are fixed
+  - SPHINXOPTS=\"-W\" make html
-  - mkdocs build
+  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/api/inference_params.html
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
-  mirror_hardwares: [amdexperimental]
+  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@ -58,13 +57,11 @@ steps:
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker
 - label: Python-only Installation Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
@ -72,9 +69,8 @@ steps:
  - bash standalone_tests/python_only_compile.sh
 - label: Basic Correctness Test # 30min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  #mirror_hardwares: [amd]
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
@ -82,14 +78,12 @@ steps:
  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Chunked Prefill Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
@ -98,7 +92,7 @@ steps:
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 - label: Core Test # 10min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/core
@ -108,10 +102,9 @@ steps:
  - pytest -v -s core
 - label: Entrypoints Test # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  torch_nightly: true
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@ -119,69 +112,61 @@ steps:
  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  commands:
-  # test with tp=2 and external_dp=2
+  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
-  - pushd ../examples/offline_inference
+  - python3 ../examples/offline_inference/rlhf.py
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
 - label: Metrics, Tracing Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 2
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing
 ##### fast check tests  #####
 #####  1 GPU test  #####
 - label: Regression Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@ -191,7 +176,7 @@ steps:
  working_dir: "/vllm-workspace/tests" # optional
 - label: Engine Test # 10min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/engine
@ -205,34 +190,28 @@ steps:
  - pytest -v -s tokenization
 - label: V1 Test
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # split the test to avoid interference
-    - pytest -v -s v1/core
+    - VLLM_USE_V1=1 pytest -v -s v1/core
-    - pytest -v -s v1/engine
+    - VLLM_USE_V1=1 pytest -v -s v1/engine
-    - pytest -v -s v1/entrypoints
+    - VLLM_USE_V1=1 pytest -v -s v1/sample
-    - pytest -v -s v1/sample
+    - VLLM_USE_V1=1 pytest -v -s v1/worker
-    - pytest -v -s v1/worker
+    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
-    - pytest -v -s v1/structured_output
+    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
    - pytest -v -s v1/spec_decode
    - pytest -v -s v1/kv_connector/unit
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_metrics_reader.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - pytest -v -s v1/e2e
+    - VLLM_USE_V1=1 pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 - label: Examples Test # 25min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
@ -243,20 +222,17 @@ steps:
    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py
-    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py
-    - python3 offline_inference/vision_language_embedding.py --seed 0
+    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 - label: Prefix Caching Test # 9min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
@ -264,7 +240,6 @@ steps:
    - pytest -v -s prefix_caching
 - label: Samplers Test # 36min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
@ -275,7 +250,7 @@ steps:
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 - label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/model_executor/guided_decoding
@ -286,7 +261,6 @@ steps:
    - pytest -v -s model_executor/test_guided_processors.py
 - label: Speculative decoding tests # 40min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
@ -297,29 +271,15 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
  parallelism: 4
 - label: PyTorch Compilation Unit Tests
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
 - label: PyTorch Fullgraph Smoke Test # 9min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@ -330,110 +290,58 @@ steps:
  - pytest -v -s compile/piecewise/test_toy_llama.py
 - label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test
+- label: Kernels Test %N # 1h each
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  commands:
    - pytest -v -s kernels/core
 - label: Kernels Attention Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
-  - vllm/v1/attention
+  - tests/kernels
  - tests/kernels/attention
  commands:
-    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 2
+  parallelism: 4
 - label: Kernels Quantization Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
 - label: Kernels MoE Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  commands:
    - pytest -v -s kernels/moe
 - label: Kernels Mamba Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  commands:
    - pytest -v -s kernels/mamba
 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 - label: Benchmarks # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
-  - bash scripts/run-benchmarks.sh
+  - bash run-benchmarks.sh
- label: Benchmarks CLI Test # 10min
+- label: Quantization Test # 33min
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
 - label: Quantization Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
-  commands:
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 - label: LM Eval Small Models # 53min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
 - label: OpenAI API correctness
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
@ -442,7 +350,6 @@ steps:
  - pytest -s entrypoints/openai/correctness/
 - label: Encoder Decoder tests # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
@ -450,117 +357,97 @@ steps:
    - pytest -v -s encoder_decoder
 - label: OpenAI-Compatible Tool Use # 20 min
  mirror_hardwares: [amdexperimental]
  fast_check: false
  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
    - pytest -v -s mistral_tool_use
 #####  models test  #####
 - label: Basic Models Test # 24min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    - pytest -v -s models/test_utils.py
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 32min
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
-  - tests/models/language
+  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    - pytest -v -s models/embedding/language -m core_model
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/language -m core_model
- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended) # 1h10min
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/language/generation
+  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    - pytest -v -s models/embedding/language -m 'not core_model'
    - pytest -v -s models/language/generation -m 'not core_model'
- label: Language Models Test (Extended Pooling)  # 36min
+- label: Multi-Modal Models Test (Standard) # 40min
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/language/pooling
+  - tests/models/decoder_only/audio_language
-  commands:
+  - tests/models/decoder_only/vision_language
-    - pytest -v -s models/language/pooling -m 'not core_model'
+  - tests/models/embedding/vision_language
-
+  - tests/models/encoder_decoder/audio_language
- label: Multi-Modal Models Test (Standard)
+  - tests/models/encoder_decoder/vision_language
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal
-    - pytest -v -s models/multimodal/processing
+    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
-    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+    - pytest -v -s models/embedding/vision_language -m core_model
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 48m
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/multimodal
+  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
    # HACK - run phi3v tests separately to sidestep this transformers bug
    # https://github.com/huggingface/transformers/issues/34307
    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 # 38m
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/multimodal
+  - tests/models/decoder_only/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
 - label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 - label: Quantized Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
    - pytest -v -s models/quantization
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  commands:
    - echo 'Testing custom models...'
@ -572,7 +459,6 @@ steps:
 #####  multi gpus test  #####
 - label: Distributed Comm Ops Test # 7min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -583,7 +469,6 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py
 - label: 2 Node Tests (4 GPUs in total) # 16min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
@ -602,7 +487,7 @@ steps:
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 - label: Distributed Tests (2 GPUs) # 40min
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -616,37 +501,34 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 - label: Plugin Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  fast_check: true
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
-  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  # begin platform plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
@ -657,10 +539,8 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 - label: Multi-step Tests (4 GPUs) # 36min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -681,7 +561,6 @@ steps:
  - pytest -v -s multi_step/test_correctness_llm.py
 - label: Pipeline Parallelism Test # 45min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -695,7 +574,6 @@ steps:
  - pytest -v -s distributed/test_pipeline_parallel.py
 - label: LoRA TP Test (Distributed)
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
@ -704,14 +582,16 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # This test runs llama 13B, so it is required to run on 4 GPUs.
    - pytest -v -s -x lora/test_long_context.py
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_minicpmv_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -721,7 +601,6 @@ steps:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 - label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental] 
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
@ -760,4 +639,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
+    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
+    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
+    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
 aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -10,39 +10,27 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb
+/vllm/model_executor/guided_decoding @mgoin
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 CMakeLists.txt @tlrmchlsmth
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
+/tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/quantization @mgoin @robertgshaw2-redhat
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
 /tests/multi_step @alexm-redhat @comaniac
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
-/tests/lora @jeejeelee
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -14,19 +14,19 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
-      <summary>The output of <code>python collect_env.py</code></summary>
+      <summary>The output of `python collect_env.py`</summary>
      ```text
      Your output of `python collect_env.py` here
      ```
-
+      
      </details>
  validations:
    required: true
@ -75,20 +75,20 @@ body:
      ```
      ```
-      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+      The error message you got, with the full traceback.
      ```
  validations:
    required: true
 - type: markdown
  attributes:
-    value: |
+    value: >
-      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
-      Thanks for reporting 🙏!
+      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
--- a/.github/ISSUE_TEMPLATE/450-ci-failure.yml
+++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml
@ -1,69 +0,0 @@
 name: 🧪 CI failure report
 description: Report a failing test.
 title: "[CI Failure]: "
 labels: ["ci-failure"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Include the name of the failing Buildkite step and test file in the title.
 - type: input
  attributes:
    label: Name of failing test
    description: |
      Paste in the fully-qualified name of the failing test from the logs.
    placeholder: |
      `path/to/test_file.py::test_name[params]`
  validations:
    required: true
 - type: checkboxes
  attributes:
    label: Basic information
    description: Select all items that apply to the failing test.
    options:
      - label: Flaky test
      - label: Can reproduce locally
      - label: Caused by external libraries (e.g. bug in `transformers`)
 - type: textarea
  attributes:
    label: 🧪 Describe the failing test
    description: |
      Please provide a clear and concise description of the failing test.
    placeholder: |
      A clear and concise description of the failing test.
      ```
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
 - type: textarea
  attributes:
    label: 📝 History of failing test
    description: |
      Since when did the test start to fail?
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
    placeholder: |
      Approximate timeline and/or problematic PRs
      A link to the Buildkite analytics of the failing test (if available)
  validations:
    required: true
 - type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
 - type: markdown
  attributes:
    value: >
      Thanks for reporting 🙏!
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
--- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
@ -35,7 +35,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -0,0 +1,28 @@
 name: 🎲 Misc/random discussions that do not fit into the above categories.
 description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 title: "[Misc]: "
 labels: ["misc"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Anything you want to discuss about vllm.
    description: >
      Anything you want to discuss about vllm.
  validations:
    required: true
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,5 +1 @@
 blank_issues_enabled: false
 contact_links:
  - name: Questions
    url: https://discuss.vllm.ai
    about: Ask questions and discuss with other vLLM community members
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)
 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -23,7 +23,7 @@ updates:
      - dependency-name: "lm-format-enforcer"
      - dependency-name: "gguf"
      - dependency-name: "compressed-tensors"
-      - dependency-name: "ray[cgraph]" # Ray Compiled Graph
+      - dependency-name: "ray[adag]"
      - dependency-name: "lm-eval"
    groups:
      minor-update:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -19,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
-      - files~=^docker/Dockerfile
+      - files~=^Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@ -36,38 +36,15 @@ pull_request_rules:
      add:
        - frontend
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
-      - files~=^tests/v1/structured_output/
+      - files=benchmarks/benchmark_serving_guided.py
-      - files=tests/v1/entrypoints/llm/test_guided_generate.py
+      - files=benchmarks/benchmark_guided.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
@ -96,56 +73,6 @@ pull_request_rules:
      add:
        - v1
 - name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu
 - name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
@ -161,17 +88,6 @@ pull_request_rules:
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
 - name: assign reviewer for tensorizer changes
  conditions:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
      - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
        - "sangstar"
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 python3 - <<EOF
-import regex as re
+import re
 with open("${NEW}", "r") as file:
    content = file.read()
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -1,6 +1,4 @@
 name: Add label on auto-merge enabled
 permissions:
    pull-requests: write
 on:
    pull_request_target:
        types:
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -20,12 +20,7 @@ jobs:
        with:
          python-version: '3.12'
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install regex
      - name: Update PR description
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
+        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -2,9 +2,6 @@ name: Lint and Deploy Charts
 on: pull_request
 permissions:
  contents: read
 jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
@ -53,7 +50,7 @@ jobs:
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
@ -69,7 +66,7 @@ jobs:
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
-
+    
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@ -82,4 +79,4 @@ jobs:
                          "max_tokens": 7,
                          "temperature": 0
                  }'):$CODE"
-          echo "$CODE"
+          echo "$CODE"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -5,9 +5,6 @@ on:
  push:
    branches: [main]
 permissions:
  contents: read
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -39,7 +39,7 @@ jobs:
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)
-  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
+  # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. 
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}
@ -50,7 +50,7 @@ jobs:
  #     matrix:
  #         os: ['ubuntu-20.04']
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
-  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
  #         cuda-version: ['11.8', '12.1']
  #   steps:
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -1,6 +1,4 @@
 name: PR Reminder Comment Bot
 permissions:
  pull-requests: write
 on:
  pull_request_target:
    types: [opened]
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 # Install requirements
-$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@ -1,4 +1,4 @@
-// Uses GitHub's API to create the release and wait for result.
+// Uses Github's API to create the release and wait for result.
 // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
 module.exports = async (github, context, core) => {
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,7 @@
 /vllm/_version.py
 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/*
+vllm/vllm_flash_attn/
 # Byte-compiled / optimized / DLL files
 __pycache__/
@ -77,6 +77,10 @@ instance/
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/
 # PyBuilder
 .pybuilder/
 target/
@ -146,7 +150,6 @@ venv.bak/
 # mkdocs documentation
 /site
 docs/examples
 # mypy
 .mypy_cache/
@ -194,11 +197,8 @@ _build/
 hip_compat.h
 # Benchmark dataset
-benchmarks/**/*.json
+benchmarks/*.json
 # Linting
 actionlint
 shellcheck*/
 # Ingore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,6 +1,3 @@
 default_install_hook_types:
  - pre-commit
  - commit-msg
 default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
@ -11,46 +8,44 @@ repos:
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.9.3
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+  rev: v2.4.0
  hooks:
  - id: codespell
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
+  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v19.1.7
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.9.29
+  rev: v0.9.27
  hooks:
  - id: pymarkdown
    exclude: '.*\.inc\.md'
    args: [fix]
 - repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.6.2
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+      args: [requirements-test.in, -o, requirements-test.txt]
-      files: ^requirements/test\.(in|txt)$
+      files: ^requirements-test\.(in|txt)$
 - repo: local
  hooks:
  - id: mypy-local
@ -58,7 +53,7 @@ repos:
    entry: tools/mypy.sh 0 "local"
    language: python
    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
    stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
@ -104,8 +99,8 @@ repos:
    args:
      - -c
      - |
-        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
-          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
+          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
        fi
    language: system
    verbose: true
@ -124,25 +119,6 @@ repos:
    language: system
    always_run: true
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
    entry: tools/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
    entry: python tools/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
    entry: python tools/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -8,10 +8,14 @@ build:
  tools:
    python: "3.12"
-mkdocs:
+sphinx:
-  configuration: mkdocs.yaml
+  configuration: docs/source/conf.py
  fail_on_warning: true
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
  install:
-    - requirements: requirements/docs.txt
+    - requirements: docs/requirements-docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -29,8 +30,11 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 # Supported NVIDIA architectures.
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -40,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
-# versions are derived from docker/Dockerfile.rocm
+# versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
 #
 # Try to find python package with an executable that exactly matches
@ -76,15 +80,6 @@ endif()
 #
 find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 endif()
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -232,34 +227,28 @@ endif()
 #
 set(VLLM_EXT_SRC
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
  "csrc/attention/vertical_slash_index.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
-  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
+  # Please keep this in sync with FetchContent_Declare line below.
  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -277,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG ${CUTLASS_REVISION}
+        GIT_TAG v3.8.0
        GIT_PROGRESS TRUE
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -289,16 +278,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  FetchContent_MakeAvailable(cutlass)
  list(APPEND VLLM_EXT_SRC
    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/custom_all_reduce.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/cutlass_extensions/common.cpp"
+    "csrc/cutlass_extensions/common.cpp")
    "csrc/attention/mla/cutlass_mla_entry.cu")
  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -307,143 +297,195 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
-  # Marlin kernels: generate and build for supported architectures
+  if (MARLIN_ARCHS)
-  optional_cuda_sources(
+    set(MARLIN_SRCS
-    NAME Marlin
+       "csrc/quantization/fp8/fp8_marlin.cu"
-    ARCHS "8.0;9.0+PTX"
+       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
-    GEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py"
+       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-    GEN_GLOB "csrc/quantization/gptq_marlin/kernel_*.cu"
+       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
-    SRCS
+       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
-      "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
-      "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
+    set_gencode_flags_for_srcs(
-      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      SRCS "${MARLIN_SRCS}"
-      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+      CUDA_ARCHS "${MARLIN_ARCHS}")
-      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
+    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
-  )
+    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
-  # AllSpark kernels
+  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  optional_cuda_sources(
+  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
-    NAME AllSpark
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
-    ARCHS "8.0;8.6;8.7;8.9"
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    SRCS
+    set(SRCS
-      "csrc/quantization/gptq_allspark/allspark_repack.cu"
+       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
-      "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-  )
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require CUDA 12.0 or later
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-  optional_cuda_sources(
+    # build any 3x kernels
-    NAME scaled_mm_c3x_sm90
+    set(SCALED_MM_3X_ARCHS)
-    MIN_VERSION 12.0
+  endif()
    ARCHS "9.0a"
    SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu"
    FLAGS "-DENABLE_SCALED_MM_SM90=1"
    VERSION_MSG
      "Not building scaled_mm_c3x_sm90: CUDA Compiler version is not >= 12.0."
      "Please upgrade to CUDA 12.0 or later to run FP8 quantized models on Hopper."
  )
-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require CUDA 12.8 or later
+  #
-  optional_cuda_sources(
+  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
-    NAME scaled_mm_c3x_sm100
+  # kernels for the remaining archs that are not already built for 3x.
-    MIN_VERSION 12.8
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    ARCHS "10.0a;10.1a;12.0a"
+    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
-    SRCS
+  # subtract out the archs that are already built for 3x
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+  if (SCALED_MM_2X_ARCHS)
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
-    FLAGS "-DENABLE_SCALED_MM_SM100=1"
+    set_gencode_flags_for_srcs(
-    VERSION_MSG
+      SRCS "${SRCS}"
-      "Not building scaled_mm_c3x_sm100: CUDA Compiler version is not >= 12.8."
+      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
-      "Please upgrade to CUDA 12.8 or later to run FP8 quantized models on Blackwell."
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
-  )
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
-
+    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
-  # For the cutlass_scaled_mm kernels for Pre-hopper (c2x, i.e. CUTLASS 2.x)
+  else()
-  optional_cuda_sources(
+    if (SCALED_MM_3X_ARCHS)
-    NAME scaled_mm_c2x
+      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
-    ARCHS "7.5;8.0;8.9+PTX"
+                     " for and covered by scaled_mm_c3x")
-    SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
+    else()
-    FLAGS "-DENABLE_SCALED_MM_C2X=1"
+      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
-  )
+                    "in CUDA target architectures")
    endif()
  endif()
  #
  # 2:4 Sparse Kernels
-  optional_cuda_sources(
+
-    NAME sparse_scaled_mm_c3x
+  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-    MIN_VERSION 12.2
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
-    ARCHS "9.0a;"
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
-    SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu"
+    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
-    FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1"
+    set_gencode_flags_for_srcs(
-    VERSION_MSG
+      SRCS "${SRCS}"
-      "Not building sparse_scaled_mm_c3x: CUDA Compiler version is not >= 12.2."
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
-      "Please upgrade to CUDA 12.2 or later to run FP8 sparse quantized models on Hopper."
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
-  )
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
    else()
      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
  # FP4 Archs and flags
-  optional_cuda_sources(
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-    NAME NVFP4
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
-    MIN_VERSION 12.8
+    set(SRCS
    ARCHS "10.0a"
    SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
+    )
-    FLAGS "-DENABLE_NVFP4=1"
+    set_gencode_flags_for_srcs(
-  )
+      SRCS "${SRCS}"
-
+      CUDA_ARCHS "${FP4_ARCHS}")
-  # CUTLASS MLA Archs and flags
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
-  optional_cuda_sources(
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
-    NAME CUTLASS_MLA
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
-    MIN_VERSION 12.8
+  else()
-    ARCHS "10.0a"
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
-    SRCS "csrc/attention/mla/cutlass_mla_kernels.cu"
+    # clear FP4_ARCHS
-    FLAGS "-DENABLE_CUTLASS_MLA=1"
+    set(FP4_ARCHS)
-  )
+  endif()
  # Add MLA-specific include directories only to MLA source files
  set_source_files_properties(
    "csrc/attention/mla/cutlass_mla_kernels.cu"
    PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common"
  )
  # CUTLASS MoE kernels
  optional_cuda_sources(
    NAME grouped_mm_c3x
    MIN_VERSION 12.3
    ARCHS "9.0a;10.0a"
    SRCS
      "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
      "csrc/quantization/cutlass_w8a8/moe/moe_data.cu"
    FLAGS "-DENABLE_CUTLASS_MOE_SM90=1"
    VERSION_MSG 
      "Not building grouped_mm_c3x kernels as CUDA Compiler is less than 12.3."
      "We recommend upgrading to CUDA 12.3 or later if you intend on running FP8 quantized MoE models on Hopper."
  )
  #
  # Machete kernels
-  # Machete kernels: generate and build for supported architectures
+  # The machete kernels only work on hopper and require CUDA 12.0 or later.
  # Only build Machete kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  optional_cuda_sources(
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
-    NAME Machete
+    #
-    MIN_VERSION 12.0
+    # For the Machete kernels we automatically generate sources for various
-    ARCHS "${MACHETE_ARCHS}"
+    # preselected input type pairs and schedules.
-    GEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py"
+    # Generate sources:
-    GEN_PYTHONPATH_PREPEND 
+    set(MACHETE_GEN_SCRIPT
-      "${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/"
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
-    GEN_GLOB "csrc/quantization/machete/generated/*.cu"
+    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
-    SRCS "csrc/quantization/machete/machete_pytorch.cu"
+
-    VERSION_MSG
+    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
-      "Not building Machete kernels as CUDA Compiler version is less than 12.0."
+    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
-      "We recommend upgrading to CUDA 12.0 or later to run w4a16 quantized models on Hopper."
+
-  )
+    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
          ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      )
      if (NOT machete_generation_result EQUAL 0)
        message(FATAL_ERROR "Machete generation failed."
                            " Result: \"${machete_generation_result}\""
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
    else()
      message(STATUS "Machete generation script has not changed, skipping generation.")
    endif()
    # Add machete generated sources
    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
    # forward compatible
    set_gencode_flags_for_srcs(
      SRCS "${MACHETE_GEN_SOURCES}"
      CUDA_ARCHS "${MACHETE_ARCHS}")
    list(APPEND VLLM_EXT_SRC
      csrc/quantization/machete/machete_pytorch.cu)
    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a16 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building Machete kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()
 # if CUDA endif
 endif()
@ -456,7 +498,6 @@ define_gpu_extension_target(
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)
@ -475,37 +516,35 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
 endif()
 # Apply gencode flags to base MOE extension sources
 set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")
 ## Marlin MOE kernels: generate and include for supported architectures
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  optional_cuda_sources(
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
-    NAME "Marlin MOE"
+  if (MARLIN_MOE_ARCHS)
-    ARCHS "8.0;9.0+PTX"
+    set(MARLIN_MOE_SRC
-    GEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-    GEN_GLOB "csrc/moe/marlin_moe_wna16/*.cu"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-    OUT_SRCS_VAR VLLM_MOE_EXT_SRC
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-  )
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
        "csrc/moe/marlin_moe_ops.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
 endif()
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(MOE_PERMUTE_SRC
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
      "csrc/moe/moe_permute_unpermute_op.cu")
  set_gencode_flags_for_srcs(
    SRCS "${MARLIN_PERMUTE_SRC}"
    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
 endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
@ -514,8 +553,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)
@ -525,7 +562,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")
  define_gpu_extension_target(
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,3 +1,3 @@
 # Contributing to vLLM
-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -2,14 +2,14 @@
 # to run the OpenAI compatible server.
 # Please update any changes made here to
-# docs/contributing/dockerfile/dockerfile.md and
+# docs/source/contributing/dockerfile/dockerfile.md and
-# docs/assets/contributing/dockerfile-stages-dependency.png
+# docs/source/assets/contributing/dockerfile-stages-dependency.png
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
-    && for i in 1 2 3; do \
+    && add-apt-repository ppa:deadsnakes/ppa \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -34,11 +31,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
 RUN apt-get install -y gcc-10 g++-10
@ -63,21 +55,19 @@ WORKDIR /workspace
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121";  \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi
-COPY requirements/common.txt requirements/common.txt
+COPY requirements-common.txt requirements-common.txt
-COPY requirements/cuda.txt requirements/cuda.txt
+COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt \
+    uv pip install --system -r requirements-cuda.txt
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@ -89,21 +79,15 @@ FROM base AS build
 ARG TARGETPLATFORM
 # install build dependencies
-COPY requirements/build.txt requirements/build.txt
+COPY requirements-build.txt requirements-build.txt
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
+    uv pip install --system -r requirements-build.txt
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@ -140,9 +124,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git  \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
@ -162,35 +143,23 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
 #################### DEV IMAGE ####################
 FROM base as dev
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+COPY requirements-lint.txt requirements-lint.txt
-# Reference: https://github.com/astral-sh/uv/pull/1694
+COPY requirements-test.txt requirements-test.txt
-ENV UV_HTTP_TIMEOUT=500
+COPY requirements-dev.txt requirements-dev.txt
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Workaround for #17068
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+    uv pip install --system -r requirements-dev.txt
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt \
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 SHELL ["/bin/bash", "-c"]
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@ -200,10 +169,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && for i in 1 2 3; do \
+    && add-apt-repository ppa:deadsnakes/ppa \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@ -215,11 +181,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
@ -232,15 +193,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
+    uv pip install --system dist/*.whl --verbose
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 # If we need to build FlashInfer wheel before its release:
 # $ export FLASHINFER_ENABLE_AOT=1
@ -257,34 +216,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
+    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
    if [[ "$CUDA_VERSION" == 12.8* ]]; then \
        uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
    else \
        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
        CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
        if [ "$CUDA_MAJOR" -lt 12 ]; then \
            export FLASHINFER_ENABLE_SM90=0; \
        fi; \
        uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
    fi \
 fi
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
-RUN --mount=type=cache,target=/root/.cache/uv \
+# Although we build Flashinfer with AOT mode, there's still
 . /etc/environment && \
 uv pip list
 # Even when we build Flashinfer with AOT mode, there's still
 # some issues w.r.t. JIT compilation. Therefore we need to
 # install build dependencies for JIT compilation.
 # TODO: Remove this once FlashInfer AOT wheel is fixed
-COPY requirements/build.txt requirements/build.txt
+COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
+    uv pip install --system -r requirements-build.txt
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### vLLM installation IMAGE ####################
@ -295,21 +237,9 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Workaround for #17068
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 # install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \   
+RUN --mount=type=cache,target=/root/.cache/uv \
-    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+    uv pip install --system -r requirements-dev.txt
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
        uv pip install --system -r requirements/dev.txt; \
    fi
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@ -328,26 +258,19 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
 RUN cp -r examples test_docs/
 RUN mv vllm test_docs/
 RUN mv mkdocs.yaml test_docs/
 #################### TEST IMAGE ####################
 #################### OPENAI API SERVER ####################
 # base openai image with additional requirements, for any subsequent openai-style images
 FROM vllm-base AS vllm-openai-base
 ARG TARGETPLATFORM
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi
 ENV VLLM_USAGE_SOURCE production-docker-image
--- a/docker/Dockerfile.arm
+++ b/docker/Dockerfile.arm
@ -26,18 +26,18 @@ WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
-    pip install -r requirements/build.txt
+    pip install -r requirements-build.txt
 FROM cpu-test-arm AS build
 WORKDIR /workspace/vllm
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
-    pip install -v -r requirements/cpu.txt
+    pip install -v -r requirements-cpu.txt
 COPY . .
 ARG GIT_REPO_CHECK=0
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -0,0 +1,69 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 FROM ubuntu:22.04 AS cpu-test-1
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
 # intel-openmp provides additional performance improvement vs. openmp
 # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp==2025.0.1
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 RUN pip install intel_extension_for_pytorch==2.5.0
 WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt
 FROM cpu-test-1 AS build
 WORKDIR /workspace/vllm
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist
 WORKDIR /workspace/
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
@ -1,10 +1,10 @@
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
-RUN pip install -v -r requirements/hpu.txt
+RUN pip install -v -r requirements-hpu.txt
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
 FROM $BASE_IMAGE
@ -21,8 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
-RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest
 # uninstall transformers-neuronx package explicitly to avoid version conflict
@ -35,7 +36,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 RUN python3 -m pip install -U \
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        -r requirements/neuron.txt
+        -r requirements-neuron.txt
 ENV VLLM_TARGET_DEVICE neuron
 RUN --mount=type=bind,source=.git,target=.git \
@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
 # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
 RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
 RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
 # overwrite entrypoint to run bash script
 RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -0,0 +1,29 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 FROM ubuntu:22.04 AS dev
 RUN apt-get update -y && \
    apt-get install -y \
        git python3-pip \
        ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 RUN python3 -m pip install -U pip
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
 # build vLLM with OpenVINO backend
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
 COPY examples/ /workspace/examples
 COPY benchmarks/ /workspace/benchmarks
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -0,0 +1,37 @@
 FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
 RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 RUN --mount=type=cache,target=/root/.cache/pip  \
    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        -r requirements-cpu.txt \
        xformers uvloop==0.20.0
 RUN --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py install
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 WORKDIR /workspace/
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@ -12,8 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
-    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
+    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
    apt-transport-https ca-certificates wget curl
 # Remove sccache    
 RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
@ -39,14 +38,14 @@ FROM fetch_vllm AS build_vllm
 ARG USE_CYTHON
 # Build vLLM
 RUN cd vllm \
-    && python3 -m pip install -r requirements/rocm.txt \
+    && python3 -m pip install -r requirements-rocm.txt \
    && python3 setup.py clean --all  \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
+    && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
    && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
-COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
@ -61,8 +60,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 # Install vLLM
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
-    && pip install -U -r requirements/rocm.txt \
+    && pip install -U -r requirements-rocm.txt \
    && pip install -U -r requirements/rocm-test.txt \
    && pip uninstall -y vllm \
    && pip install *.whl
@ -101,7 +99,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \
 # Install vLLM
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
-    && pip install -U -r requirements/rocm.txt \
+    && pip install -U -r requirements-rocm.txt \
    && pip uninstall -y vllm \
    && pip install *.whl
@ -114,16 +112,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false
 # ENV that can improve safe tensor loading, and end-to-end time
 ENV SAFETENSORS_FAST_GPU=1
 # User-friendly environment setting for multi-processing to avoid below RuntimeError.
 # RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
 # you must use the 'spawn' start method 
 # See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 CMD ["/bin/bash"]
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,26 +1,24 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
+ARG HIPBLASLT_BRANCH="4d40e36"
 ARG HIPBLAS_COMMON_BRANCH="7c1566b"
 ARG LEGACY_HIPBLASLT_OPTION=
 ARG RCCL_BRANCH="648a58d"
 ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
+ARG PYTORCH_BRANCH="3a585126"
-ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
 ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="1a7f4dfa"
+ARG FA_BRANCH="b7d29fb"
-ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
 ARG AITER_BRANCH="c1debd8"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 FROM ${BASE_IMAGE} AS base
 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 ARG PYTHON_VERSION=3.12
@ -31,11 +29,8 @@ ENV DEBIAN_FRONTEND=noninteractive
 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
+    && apt-get install -y software-properties-common git curl sudo vim less \
-    && for i in 1 2 3; do \
+    && add-apt-repository ppa:deadsnakes/ppa \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
@ -45,7 +40,7 @@ RUN apt-get update -y \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@ -63,8 +58,7 @@ RUN cd hipBLAS-common \
 RUN git clone https://github.com/ROCm/hipBLASLt
 RUN cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
-    && apt-get install -y llvm-dev \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -114,24 +108,11 @@ RUN git clone ${FA_REPO}
 RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install
 FROM base AS build_aiter
 ARG AITER_BRANCH
 ARG AITER_REPO
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN git clone --recursive ${AITER_REPO}
 RUN cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt
 RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
@ -147,11 +128,8 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    pip install /install/*.whl
 ARG BASE_IMAGE
 ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
@ -164,8 +142,6 @@ ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG FA_BRANCH
 ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -179,5 +155,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@ -15,15 +15,12 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 # Remove existing versions of dependencies
 RUN pip uninstall -y torch torch_xla torchvision
 ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 -m pip install \
-        -r requirements/tpu.txt
+        -r requirements-tpu.txt
-RUN python3 -m pip install -e .
+RUN python3 setup.py develop
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -0,0 +1,69 @@
 FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl \
    ffmpeg \
    git \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    lsb-release \
    numactl \
    python3 \
    python3-dev \
    python3-pip \
    # vim \
    wget
 WORKDIR /workspace/vllm
 COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
 COPY requirements-common.txt /workspace/vllm/requirements-common.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir \
    -r requirements-xpu.txt
 RUN git clone https://github.com/intel/pti-gpu && \
    cd pti-gpu/sdk && \
    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
    mkdir build && \
    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
    make -j && \
    cmake --install . --config Release --prefix "/usr/local"
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
 COPY . .
 ARG GIT_REPO_CHECK
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 ENV VLLM_TARGET_DEVICE=xpu
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 setup.py install
 CMD ["/bin/bash"]
 FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'
 ENV VLLM_USAGE_SOURCE production-docker-image \
    TRITON_XPU_PROFILE 1
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +1,9 @@
 include LICENSE
-include requirements/common.txt
+include requirements-common.txt
-include requirements/cuda.txt
+include requirements-cuda.txt
-include requirements/rocm.txt
+include requirements-rocm.txt
-include requirements/neuron.txt
+include requirements-neuron.txt
-include requirements/cpu.txt
+include requirements-cpu.txt
 include CMakeLists.txt
 recursive-include cmake *
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 <p align="center">
  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
-    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
  </picture>
 </p>
@ -10,24 +10,18 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 ---
 We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9)
 ---
 *Latest News* 🔥
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
+
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 <details>
 <summary>Previous News</summary>
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
@ -43,9 +37,8 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
 </details>
 ---
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -58,7 +51,7 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
 - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 - Speculative decoding
 - Chunked prefill
@ -74,7 +67,7 @@ vLLM is flexible and easy to use with:
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 - Prefix caching support
- Multi-LoRA support
+- Multi-lora support
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
@ -93,21 +86,21 @@ pip install vllm
 ```
 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
 ## Contributing
 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
 ## Sponsors
 vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 <!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
 Cash Donations:
 - a16z
 - Dropbox
@ -123,7 +116,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
@ -154,11 +146,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For technical questions and feature requests, please use Github issues or discussions.
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
+- For discussing with fellow users and coordinating contributions and development, please use Slack.
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For security disclosures, please use Github's security advisory feature.
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
+- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
 ## Media Kit
--- a/RELEASE.md
+++ b/RELEASE.md
@ -1,54 +0,0 @@
 # Releasing vLLM
 vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
 ## Release Versioning
 vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
 * _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
 * _minor_ major features
 * _patch_ features and backwards-compatible bug fixes
 * _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
 ## Release Cadence
 Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
 Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
 | Release Date | Patch release versions | Post Release versions |
 | --- | --- | --- |
 | Jan 2025 | 0.7.0 | --- |
 | Feb 2025 | 0.7.1, 0.7.2, 0.7.3  | --- |
 | Mar 2025 | 0.7.4, 0.7.5 | --- |
 | Apr 2025 | 0.7.6, 0.7.7 | --- |
 | May 2025 | 0.7.8, 0.7.9 | --- |
 | Jun 2025 | 0.7.10, 0.7.11 | --- |
 | Jul 2025 | 0.7.12, 0.7.13 | --- |
 | Aug 2025 | 0.7.14, 0.7.15 | --- |
 | Sep 2025 | 0.7.16, 0.7.17 | --- |
 | Oct 2025 | 0.7.18, 0.7.19 | --- |
 | Nov 2025 | 0.7.20, 0.7.21 | --- |
 | Dec 2025 | 0.7.22, 0.7.23 | --- |
 ## Release branch
 Each release is built from a dedicated release branch.
 * For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
 * For post releases, previously cut release branch is reused
 * Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
 * Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
 * After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
 ## Release Cherry-Pick Criteria
 After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
 * Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release)
 * Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks
 * Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release)
 * Documentation improvements
 * Release branch specific changes (e.g. change version identifiers or CI fixes)
 Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -1,341 +1,29 @@
 # Benchmarking vLLM
-This README guides you through running benchmark tests with the extensive
+## Downloading the ShareGPT dataset
 datasets supported on vLLM. It’s a living document, updated as new features and datasets
 become available.
-## Dataset Overview
+You can download the dataset by running:
 <table style="width:100%; border-collapse: collapse;">
  <thead>
    <tr>
      <th style="width:15%; text-align: left;">Dataset</th>
      <th style="width:10%; text-align: center;">Online</th>
      <th style="width:10%; text-align: center;">Offline</th>
      <th style="width:65%; text-align: left;">Data Path</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><strong>ShareGPT</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
    </tr>
    <tr>
      <td><strong>BurstGPT</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
    </tr>
    <tr>
      <td><strong>Sonnet</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
    </tr>
    <tr>
      <td><strong>Random</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>synthetic</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>lmarena-ai/VisionArena-Chat</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-InstructCoder</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>likaixin/InstructCoder</code></td>
    </tr>
      <tr>
      <td><strong>HuggingFace-AIMO</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-Other</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
    </tr>
  </tbody>
 </table>
 ✅: supported
 🟡: Partial support
 🚧: to be supported
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 ---
 ## Example - Online Benchmark
 First start serving your model
 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
-Then run the benchmarking script
+## Downloading the ShareGPT4V dataset
 The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
 will ignore a datapoint if the referred image is missing.
 ```bash
-# download dataset
+wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+mkdir coco -p
-python3 vllm/benchmarks/benchmark_serving.py \
+wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
-  --backend vllm \
+unzip coco/train2017.zip -d coco/
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 10
 ```
-If successful, you will see the following output
+# Downloading the BurstGPT dataset
-```
+You can download the BurstGPT v1.1 dataset by running:
 ============ Serving Benchmark Result ============
 Successful requests:                     10        
 Benchmark duration (s):                  5.78      
 Total input tokens:                      1369      
 Total generated tokens:                  2212      
 Request throughput (req/s):              1.73      
 Output token throughput (tok/s):         382.89    
 Total Token throughput (tok/s):          619.85    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          71.54     
 Median TTFT (ms):                        73.88     
 P99 TTFT (ms):                           79.49     
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          7.91      
 Median TPOT (ms):                        7.96      
 P99 TPOT (ms):                           8.03      
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           7.74      
 Median ITL (ms):                         7.70      
 P99 ITL (ms):                            8.39      
 ==================================================
 ```
 ### VisionArena Benchmark for Vision Language Models
 ```bash
-# need a model with vision capability here
+wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path lmarena-ai/VisionArena-Chat \
  --hf-split train \
  --num-prompts 1000
 ```
 ### InstructCoder Benchmark with Speculative Decoding
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --ngram_prompt_lookup_min 2 \
    --ngram-prompt-lookup-max 5 \
    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
 ```
 ``` bash
 python3 benchmarks/benchmark_serving.py \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name hf \
    --dataset-path likaixin/InstructCoder \
    --num-prompts 2048
 ```
 ### Other HuggingFaceDataset Examples
 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 **`lmms-lab/LLaVA-OneVision-Data`**
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
  --hf-split train \
  --hf-subset "chart2text(cauldron)" \
  --num-prompts 10
 ```
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
  --hf-split train \
  --num-prompts 10
 ```
 **`AI-MO/aimo-validation-aime`**
 ``` bash
 python3 vllm/benchmarks/benchmark_serving.py \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
    --num-prompts 10 \
    --seed 42
 ```
 ### Running With Sampling Parameters
 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --top-k 10 \
  --top-p 0.9 \
  --temperature 0.5 \
  --num-prompts 10
 ```
 ---
 ## Example - Offline Throughput Benchmark
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset-name sonnet \
  --dataset-path vllm/benchmarks/sonnet.txt \
  --num-prompts 10
 ```
 If successful, you will see the following output
 ```
 Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
 Total num prompt tokens:  5014
 Total num output tokens:  1500
 ```
 ### VisionArena Benchmark for Vision Language Models
 ``` bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path lmarena-ai/VisionArena-Chat \
  --num-prompts 1000 \
  --hf-split train
 ```
 The `num prompt tokens` now includes image token counts
 ```
 Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
 Total num prompt tokens:  14527
 Total num output tokens:  1280
 ```
 ### InstructCoder Benchmark with Speculative Decoding
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
 python3 vllm/benchmarks/benchmark_throughput.py \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
    --input-len=1000 \
    --output-len=100 \
    --num-prompts=2048 \
    --async-engine \
    --ngram_prompt_lookup_min=2 \
    --ngram-prompt-lookup-max=5 \
    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
 ```
 ```
 Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
 Total num prompt tokens:  261136
 Total num output tokens:  204800
 ```
 ### Other HuggingFaceDataset Examples
 **`lmms-lab/LLaVA-OneVision-Data`**
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
  --hf-split train \
  --hf-subset "chart2text(cauldron)" \
  --num-prompts 10
 ```
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
  --hf-split train \
  --num-prompts 10
 ```
 **`AI-MO/aimo-validation-aime`**
 ```bash
 python3 benchmarks/benchmark_throughput.py \
  --model Qwen/QwQ-32B \
  --backend vllm \
  --dataset-name hf \
  --dataset-path AI-MO/aimo-validation-aime \
  --hf-split train \
  --num-prompts 10
 ```
 ### Benchmark with LoRA Adapters
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model meta-llama/Llama-2-7b-hf \
  --backend vllm \
  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --dataset_name sharegpt \
  --num-prompts 10 \
  --max-loras 2 \
  --max-lora-rank 8 \
  --enable-lora \
  --lora-path yard1/llama-2-7b-sql-lora-test
  ```
--- a/Show More
+++ b/Show More
`@ -1,3 +1,3 @@`
	`# Contributing to vLLM`	`# Contributing to vLLM`

	`You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).`	`You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).`