Merge branch 'main' into moondream2

moondream text model
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-20 08:10:52 +00:00 · 2025-01-10 06:12:27 +00:00
2095 changed files with 52649 additions and 226986 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -1,14 +1,9 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
-# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))


 def print_top_10_largest_files(zip_file):
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import argparse
 import os

--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -1,12 +1,11 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.231
+    value: 0.233
  - name: "exact_match,flexible-extract"
-    value: 0.22
+    value: 0.236
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
-model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.6353
-  - name: "exact_match,flexible-extract"
-    value: 0.637
-limit: null
-num_fewshot: null 
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -1,4 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
@ -13,10 +12,9 @@ from pathlib import Path

 import lm_eval
 import numpy
-import pytest
 import yaml

-RTOL = 0.08
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@ -47,10 +45,6 @@ def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
-
    # Launch eval requests.
    results = launch_lm_eval(eval_config)

--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -1,13 +1,15 @@
 # vLLM benchmark suite

+
 ## Introduction

 This directory contains two sets of benchmarks for vllm.
-
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.

-See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+
+See  [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+

 ## Performance benchmark quick overview

@ -17,14 +19,17 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc

 **For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.

+
 ## Nightly benchmark quick overview

-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
+**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. 

 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.

 **Benchmarking Duration**: about 3.5hrs.

+
+
 ## Trigger the benchmark

 Performance benchmark will be triggered when:
@ -34,11 +39,16 @@ Performance benchmark will be triggered when:
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

+
+
+
 ## Performance benchmark details

+
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

-### Latency test
+
+#### Latency test

 Here is an example of one test inside `latency-tests.json`:

@ -58,25 +68,23 @@ Here is an example of one test inside `latency-tests.json`:
 ```

 In this example:
-
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+-  The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
+-  The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.

-### Throughput test

+#### Throughput test
 The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.

 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.

-### Serving test
-
+#### Serving test
 We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

-```json
+```
 [
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
@ -101,7 +109,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 ```

 Inside this example:
-
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
 - The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
@ -111,33 +118,36 @@ The number of this test is less stable compared to the delay and latency benchma

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

-### Visualizing the results
-
+#### Visualizing the results
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait until the benchmark finishes running.
 The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

+
+
 ## Nightly test details

 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.

-### Workflow

- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
+#### Workflow
+
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. 
 - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
 - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
 - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.

-### Nightly tests
+#### Nightly tests

 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.

-### Docker containers
+#### Docker containers

 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

 WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.

 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
+
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -10,18 +10,12 @@ steps:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
-  
+
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
-    if: build.branch == "main"
    plugins:
    - kubernetes:
        podSpec:
@ -56,7 +50,6 @@ steps:
    agents:
      queue: H200
    depends_on: wait-for-container-image
-    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -82,7 +75,6 @@ steps:
    agents:
      queue: H100
    depends_on: wait-for-container-image
-    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -98,87 +90,3 @@ steps:
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -9,19 +9,20 @@ This file contains the downloading link for benchmarking results.

 Please download the visualization scripts in the post

+
 ## Results reproduction

 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
-  - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code:
-
-  ```console
-  export HF_TOKEN=<your HF token>
-  apt update
-  apt install -y git
-  unzip nightly-benchmarks.zip
-  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  ```
+  - Download `nightly-benchmarks.zip`. 
+  - In the same folder, run the following code
+```
+export HF_TOKEN=<your HF token>
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```

 And the results will be inside `./benchmarks/results`.
+
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -2,7 +2,6 @@
 # Nightly benchmark

 This benchmark aims to:
-
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.

@ -10,6 +9,7 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)

 Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

+
 ## Setup

 - Docker images:
@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

-## Known issues
+# Known issues

 - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
+- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@ -7,8 +7,10 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).

+
 {latency_tests_markdown_table}

+
 ## Throughput tests

 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@ -17,8 +19,10 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.

+
 {throughput_tests_markdown_table}

+
 ## Serving tests

 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@ -29,11 +33,13 @@
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).

+
 {serving_tests_markdown_table}

+
 ## json version of the benchmarking tables

-This section contains the data of the markdown tables above in JSON format.
+This section contains the data of the markdown tables above in JSON format. 
 You can load the benchmarking tables into pandas dataframes as follows:

 ```python
@ -48,9 +54,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
 ```

 The json string for all benchmarking tables:
-
 ```json
 {benchmarking_results_in_json_string}
 ```

 You can also check the raw experiment data in the Artifact tab of the Buildkite page.
+
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import json
 import os
 from pathlib import Path
@ -84,13 +82,8 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
@ -104,13 +97,8 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
@ -131,13 +119,8 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import argparse

 from transformers import AutoTokenizer
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import argparse
 import json
 from pathlib import Path
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 from lmdeploy.serve.openai.api_client import APIClient

 api_client = APIClient("http://localhost:8000")
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -426,7 +426,7 @@ main() {

  pip install -U transformers

-  pip install -r requirements/dev.txt
+  pip install -r requirements-dev.txt
  which genai-perf

  # check storage
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -10,24 +10,15 @@ set -x
 set -o pipefail

 check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  fi
-
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  echo "GPU type is $gpu_type"
 }

@ -99,15 +90,9 @@ kill_gpu_processes() {


  # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done

  # remove vllm config file
  rm -rf ~/.config/vllm
@ -324,14 +309,11 @@ run_serving_tests() {

      new_test_name=$test_name"_qps_"$qps

-      # pass the tensor parallel size to the client so that it can be displayed
-      # on the benchmark dashboard
      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
-        --metadata "tensor_parallel_size=$tp" \
        $client_args"

      echo "Running test case $test_name with qps $qps"
@ -363,11 +345,6 @@ main() {
  check_gpus
  check_hf_token

-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
@ -376,7 +353,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOGGING_LEVEL="WARNING"
+  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 import datetime
 import json
 import os
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -1,10 +1,6 @@
 #!/bin/sh
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"

 TIMEOUT_SECONDS=10

--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@ -29,4 +29,4 @@
            "num-iters": 15
        }
    }
-]
+]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -63,12 +63,11 @@
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
+            "swap_space": 16, 
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1,
+            "use_v2_block_manager": ""
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@ -32,4 +32,4 @@
            "backend": "vllm"
        }
    }
-]
+]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,23 +1,12 @@
 steps:
-  - label: "Build wheel - CUDA 12.4"
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -31,10 +20,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -48,7 +37,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build and publish TPU release image"
@ -57,7 +46,7 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
@ -67,11 +56,6 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - input: "Provide Release version here"
-    fields:
-      - text: "What is the release version?"
-        key: "release-version"
-
  - block: "Build CPU release image"
    key: block-cpu-release-image-build
    depends_on: ~
@ -82,22 +66,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
-    agents:
-      queue: neuron-postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -77,6 +77,7 @@ echo "Commands:$commands"
 #ignore certain kernels tests
 if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
+  --ignore=kernels/test_attention.py \
  --ignore=kernels/test_attention_selector.py \
  --ignore=kernels/test_blocksparse_attention.py \
  --ignore=kernels/test_causal_conv1d.py \
@ -91,61 +92,19 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_moe.py \
  --ignore=kernels/test_prefix_prefill.py \
  --ignore=kernels/test_rand.py \
-  --ignore=kernels/test_sampler.py \
-  --ignore=kernels/test_cascade_flash_attn.py \
-  --ignore=kernels/test_mamba_mixer2.py \
-  --ignore=kernels/test_aqlm.py \
-  --ignore=kernels/test_machete_mm.py \
-  --ignore=kernels/test_mha_attn.py \
-  --ignore=kernels/test_block_fp8.py \
-  --ignore=kernels/test_cutlass_moe.py \
-  --ignore=kernels/test_mamba_ssm_ssd.py \
-  --ignore=kernels/test_attention.py \
-  --ignore=kernels/test_block_int8.py \
-  --ignore=kernels/test_fused_quant_layernorm.py \
-  --ignore=kernels/test_int8_kernel.py \
-  --ignore=kernels/test_triton_moe_ptpc_fp8.py \
-  --ignore=kernels/test_permute_cols.py"
+  --ignore=kernels/test_sampler.py"
 fi

-#ignore certain Entrypoints/openai tests
+#ignore certain Entrypoints tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
-  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
-  --ignore=entrypoints/openai/test_prompt_validation.py "}
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
 fi

-#ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
-fi
-
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
-# --ignore=entrypoints/openai/test_encoder_decoder.py \
-# --ignore=entrypoints/openai/test_embedding.py \
-# --ignore=entrypoints/openai/test_oot_registration.py
-# --ignore=entrypoints/openai/test_accuracy.py \
-# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-
-
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
@ -155,16 +114,13 @@ if [[ $commands == *"--shard-id="* ]]; then
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
-    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
-        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network=host \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
-        -e AWS_ACCESS_KEY_ID \
-        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        --name "${container_name}_${GPU}" \
@ -185,16 +141,13 @@ if [[ $commands == *"--shard-id="* ]]; then
    fi
  done
 else
-  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-          --network=host \
+          --device /dev/kfd --device /dev/dri \
+          --network host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
-          -e AWS_ACCESS_KEY_ID \
-          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
          --name "${container_name}" \
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -5,8 +5,8 @@
 set -ex
 set -o pipefail

-# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+# cd into parent directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")/.."

 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.s390x .
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -8,40 +8,34 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+
 # Setup cleanup
-remove_docker_container() { 
-    set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
-}
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
-
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3

  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/offline_inference/basic.py"

  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+    pip install -r vllm/requirements-test.txt
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@ -91,4 +85,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@ -9,13 +9,11 @@ python3 use_existing_torch.py

 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
-  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
-  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"

@ -25,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and test offline inference
-docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/basic.py
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -5,7 +5,7 @@
 set -ex

 # Try building the docker image
-docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+docker build -t hpu-test-env -f Dockerfile.hpu .

 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
 EXITCODE=$?
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@ -3,7 +3,7 @@
 set -euox pipefail

 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi

--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -25,17 +25,17 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
-        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
-        # Remove unused volumes / force the system prune for old images as well.
-        docker volume prune -f && docker system prune -f
+        docker system prune -f
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
 else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .

 # Setup cleanup
 remove_docker_container() {
@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT

 # Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+# This script builds the OpenVINO docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t openvino-test -f Dockerfile.openvino .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f openvino-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+
+# This script builds the XPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t xpu-test -f Dockerfile.xpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f xpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and test offline inference/tensor parallel
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+    python3 examples/offline_inference/basic.py
+    python3 examples/offline_inference/cli.py -tp 2
+'
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -1,45 +0,0 @@
-#!/bin/bash
-
-# This script build the CPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Setup cleanup
-remove_docker_container() {
-  if [[ -n "$container_id" ]]; then
-      podman rm -f "$container_id" || true
-  fi
-  podman system prune -f
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
-
-# Run the image
-container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
-
-function cpu_tests() {
-
-  # offline inference
-  podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run basic model test
-  podman exec -it "$container_id" bash -c "
-    set -e
-    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-
-export container_id
-export -f cpu_tests
-timeout 40m bash -c cpu_tests
-
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -1,54 +0,0 @@
-#!/bin/bash
-
-set -xue
-
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
-    && tpu-info \
-    && echo TEST_0 \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
-    && echo TEST_1 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
-    && echo TEST_2 \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
-    && echo TEST_3 \
-    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
-    && echo TEST_4 \
-    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && echo TEST_5 \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
-    && echo TEST_6 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
-    && echo TEST_7 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
-    && echo TEST_8 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
-    && echo TEST_9 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
-    && echo TEST_10 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
-    && echo TEST_11 \
-    && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
-
-
-# TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -1,31 +0,0 @@
-#!/bin/bash
-
-# This script builds the XPU docker image and runs the offline inference inside the container.
-# It serves as a sanity check for compilation and basic model usage.
-set -ex
-
-image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
-container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
-
-# Setup cleanup
-remove_docker_container() { 
-  docker rm -f "${container_name}" || true; 
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image and test offline inference/tensor parallel
-docker run \
-    --device /dev/dri \
-    -v /dev/dri/by-path:/dev/dri/by-path \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    sh -c '
-    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
-'
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -2,13 +2,12 @@
 # adding a new command to an existing step. See different options here for examples.

 # This script will be fed into the Jinja template in `test-template-aws.j2` at
-# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
 # to generate the final pipeline yaml file.

 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
@ -16,7 +15,7 @@
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
 # gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
 # num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
+# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, 
 #     in this case, commands must be specified. the first command runs on first host, the second
 #     command runs on the second host.
 # working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
@ -25,8 +24,8 @@
 # When adding a test
 # - If the test belongs to an existing group, add it there
 # - If the test is short, add to any existing step
-# - If the test takes more than 10min, then it is okay to create a new step.
-#   Note that all steps execute in parallel.
+# - If the test takes more than 10min, then it is okay to create a new step. 
+#   Note that all steps execute in parallel. 

 steps:
 ##### fast check tests  #####
@ -36,12 +35,13 @@ steps:
  fast_check: true
  no_gpu: True
  commands:
-  - pip install -r ../../requirements/docs.txt
+  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/api/inference_params.html

 - label: Async Engine, Inputs, Utils, Worker Test # 24min
+  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@ -50,9 +50,9 @@ steps:
  - tests/multimodal
  - tests/test_utils
  - tests/worker
-  - tests/standalone_tests/lazy_imports.py
+  - tests/standalone_tests/lazy_torch_compile.py
  commands:
-  - python3 standalone_tests/lazy_imports.py
+  - python3 standalone_tests/lazy_torch_compile.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@ -71,16 +71,12 @@ steps:
 - label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  fast_check: true
-  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_preemption
-  - tests/basic_correctness/test_cumem.py
  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
@ -106,73 +102,60 @@ steps:
 - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  torch_nightly: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
-  - tests/entrypoints/llm
-  - tests/entrypoints/openai
-  - tests/entrypoints/test_chat_utils
-  - tests/entrypoints/offline_mode
  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
  - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
+  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
-  - tests/distributed/test_utils
-  - tests/distributed/test_pynccl
+  - tests/distributed
  - tests/spec_decode/e2e/test_integration_dist_tp4
-  - tests/compile/test_basic_correctness
+  - tests/compile
  - examples/offline_inference/rlhf.py
-  - examples/offline_inference/rlhf_colocate.py
-  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
  commands:
-  # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with internal dp
-  - python3 ../examples/offline_inference/data_parallel.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
-  - pushd ../examples/offline_inference
-  - python3 rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-  - popd
+  - python3 ../examples/offline_inference/rlhf.py

 - label: Metrics, Tracing Test # 10min
-  mirror_hardwares: [amd]
-  num_gpus: 2
+  num_gpus: 2 
+  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
-  - pytest -v -s metrics
+  - pytest -v -s metrics 
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0,<1.27.0' \
+      'opentelemetry-api>=1.26.0,<1.27.0' \
+      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing

 ##### fast check tests  #####
 #####  1 GPU test  #####

 - label: Regression Test # 5min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@ -187,9 +170,6 @@ steps:
  - vllm/
  - tests/engine
  - tests/tokenization
-  - tests/test_sequence
-  - tests/test_config
-  - tests/test_logger
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
@ -201,24 +181,7 @@ steps:
    - vllm/
    - tests/v1
  commands:
-    # split the test to avoid interference
-    - pytest -v -s v1/core
-    - pytest -v -s v1/engine
-    - pytest -v -s v1/entrypoints
-    - pytest -v -s v1/sample
-    - pytest -v -s v1/worker
-    - pytest -v -s v1/structured_output
-    - pytest -v -s v1/spec_decode
-    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s v1/test_stats.py
-    - pytest -v -s v1/test_utils.py
-    - pytest -v -s v1/test_oracle.py
-    # TODO: accuracy does not match, whether setting
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - pytest -v -s v1/e2e
-    # Integration test for streaming correctness (requires special branch).
-    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
-    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+    - VLLM_USE_V1=1 pytest -v -s v1

 - label: Examples Test # 25min
  working_dir: "/vllm-workspace/examples"
@ -228,22 +191,19 @@ steps:
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
+    - python3 offline_inference/basic.py
+    - python3 offline_inference/cpu_offload.py
+    - python3 offline_inference/chat.py
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/audio_language.py --seed 0
-    - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_embedding.py --seed 0
-    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/vision_language.py
+    - python3 offline_inference/vision_language_multi_image.py
+    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
-    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
-    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/classification.py
+    - python3 offline_inference/embedding.py
+    - python3 offline_inference/scoring.py
+    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
@ -270,7 +230,7 @@ steps:
  - vllm/model_executor/guided_decoding
  - tests/test_logits_processor
  - tests/model_executor/test_guided_processors
-  commands:
+  commands: 
    - pytest -v -s test_logits_processor.py
    - pytest -v -s model_executor/test_guided_processors.py

@ -281,27 +241,19 @@ steps:
  - vllm/model_executor/models/eagle.py
  commands:
    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests
-  source_file_dependencies:
-    - vllm/
-    - tests/compile
-  commands:
-    - pytest -v -s compile/test_pass_manager.py
-    - pytest -v -s compile/test_fusion.py
-    - pytest -v -s compile/test_sequence_parallelism.py
-
- label: PyTorch Fullgraph Smoke Test # 9min
+- label: "PyTorch Fullgraph Smoke Test" # 9min
+  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@ -311,56 +263,25 @@ steps:
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py

- label: PyTorch Fullgraph Test # 18min
+- label: "PyTorch Fullgraph Test" # 18min
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Core Operation Test
+- label: Kernels Test %N # 1h each
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
-  - tests/kernels/core
-  commands:
-    - pytest -v -s kernels/core
-
- label: Kernels Attention Test %N
-  source_file_dependencies:
-  - csrc/attention/
  - vllm/attention
-  - vllm/v1/attention
-  - tests/kernels/attention
+  - tests/kernels
  commands:
-    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 2
-
- label: Kernels Quantization Test %N
-  source_file_dependencies:
-  - csrc/quantization/
-  - vllm/model_executor/layers/quantization
-  - tests/kernels/quantization
-  commands:
-    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 2
-
- label: Kernels MoE Test
-  source_file_dependencies:
-  - csrc/moe/
-  - tests/kernels/moe
-  - vllm/model_executor/layers/fused_moe/
-  commands:
-    - pytest -v -s kernels/moe
-
- label: Kernels Mamba Test
-  source_file_dependencies:
-  - csrc/mamba/
-  - tests/kernels/mamba
-  commands:
-    - pytest -v -s kernels/mamba
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4

 - label: Tensorizer Test # 11min
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -376,14 +297,7 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - bash scripts/run-benchmarks.sh
-
- label: Benchmarks CLI Test # 10min
-  source_file_dependencies:
-  - vllm/
-  - tests/benchmarks/
-  commands:
-  - pytest -v -s benchmarks/
+  - bash run-benchmarks.sh

 - label: Quantization Test # 33min
  source_file_dependencies:
@ -401,14 +315,6 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: OpenAI API correctness
-  source_file_dependencies:
-  - csrc/
-  - vllm/entrypoints/openai/
-  - vllm/model_executor/models/whisper.py
-  commands: # LMEval+Transcription WER check
-  - pytest -s entrypoints/openai/correctness/
-
 - label: Encoder Decoder tests # 5min
  source_file_dependencies:
  - vllm/
@ -418,14 +324,12 @@ steps:

 - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
-  #mirror_hardwares: [ amd ]
+  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
-    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
-    - pytest -v -s mistral_tool_use

 #####  models test  #####

@ -434,12 +338,8 @@ steps:
  - vllm/
  - tests/models
  commands:
-    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
-    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
+    - pytest -v -s models/test_initialization.py

 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@ -449,8 +349,6 @@ steps:
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install causal-conv1d
    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/language -m core_model

@ -462,8 +360,6 @@ steps:
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install causal-conv1d
    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/language -m 'not core_model'

@ -480,12 +376,11 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal
    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-    - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/vision_language -m core_model
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model
-    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py

 - label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
@ -499,7 +394,10 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+    # HACK - run phi3v tests separately to sidestep this transformers bug
+    # https://github.com/huggingface/transformers/issues/34307
+    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
@ -515,7 +413,6 @@ steps:

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amd]
  optional: true
  commands:
    - echo 'Testing custom models...'
@ -527,7 +424,6 @@ steps:
 #####  multi gpus test  #####

 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -570,32 +466,25 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - vllm/v1/engine/
  commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
-  - pytest -v -s distributed/test_sequence_parallel.py
-  # this test fails consistently.
-  # TODO: investigate and fix
-  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
+  fast_check: true
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
@ -606,7 +495,6 @@ steps:
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # other tests continue here:
-  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
@ -627,9 +515,7 @@ steps:
  - vllm/engine
  - tests/multi_step
  commands:
-  # this test is quite flaky
-  # TODO: investigate and fix.
-  # - pytest -v -s multi_step/test_correctness_async_llm.py
+  - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

 - label: Pipeline Parallelism Test # 45min
@ -654,10 +540,13 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    # There is some Tensor Parallelism related processing logic in LoRA that
+    # This test runs llama 13B, so it is required to run on 4 GPUs.
+    - pytest -v -s -x lora/test_long_context.py
+    # There is some Tensor Parallelism related processing logic in LoRA that 
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
@ -678,7 +567,7 @@ steps:
  - vllm/
  - tests/weight_loading
  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 


 ##### multi gpus test #####
@ -690,7 +579,7 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/
-  commands:
+  commands: 
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -50,11 +50,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu12 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -66,11 +63,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu12 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,41 +2,32 @@
 # for more info about CODEOWNERS file

 # This list covers the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb
-/vllm/multimodal @DarkLight1337 @ywang96
-/vllm/vllm_flash_attn @LucasWilkinson
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic

 # Test ownership
+/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/models @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/quantization @mgoin @robertgshaw2-neuralmagic
 /.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
-/tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/model_executor/test_guided_processors.py @mgoin @russellb
-/tests/models @DarkLight1337 @ywang96
-/tests/multi_step @alexm-redhat @comaniac
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat
-/tests/spec_decode @njhill @LiuXiaoxuanPKU
-/tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb
+/tests/multi_step @alexm-neuralmagic @comaniac
 /tests/weight_loading @mgoin @youkaichao
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
@ -30,6 +30,15 @@ body:
      </details>
  validations:
    required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are experiencing crashes due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support the .pkl file format) and upload it here. This will help us reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
  attributes:
    label: 🐛 Describe the bug
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
--- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
@ -35,7 +35,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@ -0,0 +1,28 @@
+name: 🎲 Misc/random discussions that do not fit into the above categories.
+description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
+title: "[Misc]: "
+labels: ["misc"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Anything you want to discuss about vllm.
+    description: >
+      Anything you want to discuss about vllm.
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+      - label: Make sure you have already searched for relevant issues and asked the chatbot at the bottom-right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer many frequently asked questions.
+        required: true
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,5 +1 @@
 blank_issues_enabled: false
-contact_links:
-  - name: Questions
-    url: https://discuss.vllm.ai
-    about: Ask questions and discuss with other vLLM community members
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -2,5 +2,4 @@ FILL IN THE PR DESCRIPTION HERE

 FIX #xxxx (*link existing issues this PR will resolve*)

-<!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -23,7 +23,7 @@ updates:
      - dependency-name: "lm-format-enforcer"
      - dependency-name: "gguf"
      - dependency-name: "compressed-tensors"
-      - dependency-name: "ray[cgraph]" # Ray Compiled Graph
+      - dependency-name: "ray[adag]"
      - dependency-name: "lm-eval"
    groups:
      minor-update:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -5,7 +5,6 @@ pull_request_rules:
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
-      - files~=^examples/
  actions:
    label:
      add:
@ -19,7 +18,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
-      - files~=^docker/Dockerfile
+      - files~=^Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@ -36,118 +35,6 @@ pull_request_rules:
      add:
        - frontend

- name: label-multi-modality
-  description: Automatically apply multi-modality label
-  conditions:
-    - or:
-      - files~=^vllm/multimodal/
-      - files~=^tests/multimodal/
-      - files~=^tests/models/multimodal/
-      - files~=^tests/models/*/audio_language/
-      - files~=^tests/models/*/vision_language/
-      - files=tests/models/test_vision.py
-  actions:
-    label:
-      add:
-        - multi-modality
-
- name: label-structured-output
-  description: Automatically apply structured-output label
-  conditions:
-    - or:
-      - files~=^benchmarks/structured_schemas/
-      - files=benchmarks/benchmark_serving_structured_output.py
-      - files=benchmarks/run_structured_output_benchmark.sh
-      - files=docs/source/features/structured_outputs.md
-      - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
-      - files~=^vllm/model_executor/guided_decoding/
-      - files=tests/model_executor/test_guided_processors.py
-      - files=tests/entrypoints/llm/test_guided_generate.py
-      - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_guided_generate.py
-      - files~=^vllm/v1/structured_output/
-  actions:
-    label:
-      add:
-        - structured-output
-
- name: label-speculative-decoding
-  description: Automatically apply speculative-decoding label
-  conditions:
-    - or:
-      - files~=^vllm/spec_decode/
-      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
-      - files~=^tests/spec_decode/
-  actions:
-    label:
-      add:
-        - speculative-decoding
-
- name: label-v1
-  description: Automatically apply v1 label
-  conditions:
-    - or:
-      - files~=^vllm/v1/
-      - files~=^tests/v1/
-  actions:
-    label:
-      add:
-        - v1
-
- name: label-tpu
-  description: Automatically apply tpu label
-  # Keep this list in sync with `label-tpu-remove` conditions
-  conditions:
-    - or:
-      - files~=tpu.py
-      - files~=_tpu
-      - files~=tpu_
-      - files~=/tpu/
-      - files~=pallas
-  actions:
-    label:
-      add:
-        - tpu
-
- name: label-tpu-remove
-  description: Automatically remove tpu label
-  # Keep this list in sync with `label-tpu` conditions
-  conditions:
-    - and:
-      - -files~=tpu.py
-      - -files~=_tpu
-      - -files~=tpu_
-      - -files~=/tpu/
-      - -files~=pallas
-  actions:
-    label:
-      remove:
-        - tpu
-
- name: label-tool-calling
-  description: Automatically add tool-calling label
-  conditions:
-    - or:
-      - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
-      - files~=^tests/entrypoints/openai/tool_parsers/
-      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/
-      - files=docs/source/features/tool_calling.md
-      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
-      - files=docs/source/getting_started/examples/chat_with_tools.md
-      - files~=^examples/tool_chat_*
-      - files=examples/offline_inference/chat_with_tools.py
-      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
-      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
-      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
-  actions:
-    label:
-      add:
-        - tool-calling
-
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: '3.12'

--- a/.github/workflows/dummy.yml
+++ b/.github/workflows/dummy.yml
@ -0,0 +1,20 @@
+name: dummy-checks
+
+on:
+  pull_request:
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - run: echo "This is a dummy step that always passes"
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - run: echo "This is a dummy step that always passes"
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -12,17 +12,17 @@ jobs:
          fetch-depth: 0

      - name: Set up Helm
-        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
+        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
        with:
          version: v3.14.4

       #Python is required because ct lint runs Yamale and yamllint which require Python.
-      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: '3.13'

      - name: Set up chart-testing
-        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
+        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
        with:
          version: v3.10.1

@ -47,10 +47,10 @@ jobs:
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

      - name: Create kind cluster
-        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
+        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0

      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -10,11 +10,8 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+    - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-      with:
-        extra_args: --all-files --hook-stage manual
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -39,7 +39,7 @@ jobs:
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)

-  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
+  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}
@ -50,7 +50,7 @@ jobs:
  #     matrix:
  #         os: ['ubuntu-20.04']
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
-  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
  #         cuda-version: ['11.8', '12.1']

  #   steps:
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -2,6 +2,7 @@ name: PR Reminder Comment Bot
 on:
  pull_request_target:
    types: [opened]
+
 jobs:
  pr_reminder:
    runs-on: ubuntu-latest
@ -14,12 +15,7 @@ jobs:
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-                '🚀'
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

 # Install requirements
-$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@ -1,4 +1,4 @@
-// Uses GitHub's API to create the release and wait for result.
+// Uses GitHub's API to create the release and wait for the result.
 // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.

 module.exports = async (github, context, core) => {
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,7 @@
 /vllm/_version.py

 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/*
+vllm/vllm_flash_attn/

 # Byte-compiled / optimized / DLL files
 __pycache__/
@ -197,11 +197,8 @@ _build/
 hip_compat.h

 # Benchmark dataset
-benchmarks/**/*.json
+benchmarks/*.json

 # Linting
 actionlint
 shellcheck*/
-
-# Ingore moe/marlin_moe gen code
-csrc/moe/marlin_moe_wna16/kernel_*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,90 +1,62 @@
-default_install_hook_types:
-  - pre-commit
-  - commit-msg
-default_stages:
-  - pre-commit # Run locally
-  - manual # Run in CI
-exclude: 'vllm/third_party/.*'
 repos:
 - repo: https://github.com/google/yapf
-  rev: v0.43.0
+  rev: v0.32.0
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
+    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.3
+  rev: v0.6.5
  hooks:
  - id: ruff
-    args: [--output-format, github, --fix]
+    args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.4.0
+  rev: v2.3.0
  hooks:
  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
 - repo: https://github.com/PyCQA/isort
-  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
+  rev: 5.13.2
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v19.1.7
+  rev: v18.1.5
  hooks:
  - id: clang-format
-    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
+    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
  rev: v0.9.27
  hooks:
  - id: pymarkdown
-    args: [fix]
- repo: https://github.com/rhysd/actionlint
-  rev: v1.7.7
-  hooks:
-  - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.2
-  hooks:
-    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt]
-      files: ^requirements/test\.(in|txt)$
+    files: docs/.*
 - repo: local
  hooks:
-  - id: mypy-local
-    name: Run mypy for local Python installation
-    entry: tools/mypy.sh 0 "local"
-    language: python
-    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
-    stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
    entry: tools/mypy.sh 1 "3.9"
    language: python
    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: tools/mypy.sh 1 "3.10"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.11
    entry: tools/mypy.sh 1 "3.11"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.12
    entry: tools/mypy.sh 1 "3.12"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
    entry: tools/shellcheck.sh
@ -95,43 +67,7 @@ repos:
    entry: tools/png-lint.sh
    language: script
    types: [png]
-  - id: signoff-commit
-    name: Sign-off Commit
-    entry: bash
-    args:
-      - -c
-      - |
-        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
-          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
-        fi
-    language: system
-    verbose: true
-    stages: [commit-msg]
-  - id: check-spdx-header
-    name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
-    language: python
-    types: [python]
-  - id: check-filenames
-    name: Check for spaces in all filenames
-    entry: bash
-    args:
-      - -c
-      - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
-    language: system
-    always_run: true
-    pass_filenames: false
-  - id: update-dockerfile-graph
-    name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
-    language: script
-    files: ^docker/Dockerfile$
-    pass_filenames: false
-  # Keep `suggestion` last
-  - id: suggestion
-    name: Suggestion
-    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
-    language: system
-    verbose: true
-    pass_filenames: false
-  # Insert new entries above the `suggestion` entry
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.6
+  hooks:
+  - id: actionlint
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -18,4 +18,4 @@ formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
  install:
-    - requirements: requirements/docs.txt
+    - requirements: docs/requirements-docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -24,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")

+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@ -31,10 +34,10 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -44,10 +47,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
-# versions are derived from docker/Dockerfile.rocm
+# versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")

 #
 # Try to find python package with an executable that exactly matches
@ -174,54 +177,10 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

-#
-# Set rocm version dev int.
-#
-if(VLLM_GPU_LANG STREQUAL "HIP")
-  #
-  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
-  #
-  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
-
-
-  #
-  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
-  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
-  #
-  set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
-endif()
-
 #
 # Define other extension targets
 #

-#
-# cumem_allocator extension
-#
-
-set(VLLM_CUMEM_EXT_SRC
-  "csrc/cumem_allocator.cpp")
-
-set_gencode_flags_for_srcs(
-  SRCS "${VLLM_CUMEM_EXT_SRC}"
-  CUDA_ARCHS "${CUDA_ARCHS}")
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
-    cumem_allocator
-    DESTINATION vllm
-    LANGUAGE CXX
-    SOURCES ${VLLM_CUMEM_EXT_SRC}
-    LIBRARIES ${CUMEM_LIBS}
-    USE_SABI 3.8
-    WITH_SOABI)
-endif()
-
 #
 # _C extension
 #
@ -230,12 +189,10 @@ set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
-  "csrc/attention/merge_attn_states.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
-  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
@ -243,15 +200,13 @@ set(VLLM_EXT_SRC
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
-  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  # Please keep this in sync with FetchContent_Declare line below.
-  set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -268,8 +223,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    FetchContent_Declare(
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG v3.9.0
+        GIT_TAG v3.6.0
        GIT_PROGRESS TRUE

        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -285,13 +239,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/custom_all_reduce.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
-    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu")
+    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
+    "csrc/cutlass_extensions/common.cpp")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@ -300,7 +253,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
    set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
@ -320,87 +273,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                   " in CUDA target architectures")
  endif()

-  # Only build AllSpark kernels if we are building for at least some compatible archs.
-  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (ALLSPARK_ARCHS)
-    set(ALLSPARK_SRCS
-       "csrc/quantization/gptq_allspark/allspark_repack.cu"
-       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${ALLSPARK_SRCS}"
-      CUDA_ARCHS "${ALLSPARK_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
-    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
-  else()
-    message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures")
-  endif()
-
-
-  set(SCALED_MM_3X_ARCHS)
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
-    set(SRCS
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
-    # Let scaled_mm_c2x know it doesn't need to build these arches
-    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
-    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
+    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
-      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
+      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
-  endif()

-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-    )
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
-    # Let scaled_mm_c2x know it doesn't need to build these arches
-    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
-    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
-                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
-                     "later if you intend on running FP8 quantized models on "
-                     "Blackwell.")
-    else()
-      message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+    # build any 3x kernels
+    set(SCALED_MM_3X_ARCHS)
  endif()

  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@ -425,18 +329,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # 2:4 Sparse Kernels

  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper).
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
+             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
@ -446,69 +350,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
-    set(SRCS
-      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${FP4_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
-    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
-  else()
-    message(STATUS "Not building NVFP4 as no compatible archs were found.")
-    # clear FP4_ARCHS
-    set(FP4_ARCHS)
-  endif()
-
-  # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
-    set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${MLA_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
-    # Add MLA-specific include directories only to MLA source files
-    set_source_files_properties(${SRCS}
-      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
-    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
-  else()
-    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
-    # clear MLA_ARCHS
-    set(MLA_ARCHS)
-  endif()
-
-  # CUTLASS MoE kernels
-
-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
-  # to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
-             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
-    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
-                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
-                     "if you intend on running FP8 quantized MoE models on Hopper.")
-    else()
-      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()

  #
  # Machete kernels
@ -590,8 +431,7 @@ define_gpu_extension_target(
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

@ -610,70 +450,28 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
-endif()
-
 set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(VLLM_MOE_WNA16_SRC
-    "csrc/moe/moe_wna16.cu")
-
-  set_gencode_flags_for_srcs(
-    SRCS "${VLLM_MOE_WNA16_SRC}"
-    CUDA_ARCHS "${CUDA_ARCHS}")
-
-  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
+    set(MARLIN_MOE_SRC
+        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
+        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
+        "csrc/moe/marlin_moe_ops.cu")

-    #
-    # For the Marlin MOE kernels we automatically generate sources for various
-    # preselected input type pairs and schedules.
-    # Generate sources:
-    set(MOE_MARLIN_GEN_SCRIPT
-      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
-    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
-
-    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
-
-    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
-      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
-          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
-        RESULT_VARIABLE moe_marlin_generation_result
-        OUTPUT_VARIABLE moe_marlin_generation_output
-        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
-        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
-      )
-
-      if (NOT moe_marlin_generation_result EQUAL 0)
-        message(FATAL_ERROR "Marlin MOE generation failed."
-                            " Result: \"${moe_marlin_generation_result}\""
-                            "\nCheck the log for details: "
-                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
-      else()
-        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
-        message(STATUS "Marlin MOE generation completed successfully.")
-      endif()
-    else()
-      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
-    endif()
-
-    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
    set_gencode_flags_for_srcs(
-      SRCS "${MOE_WNAA16_MARLIN_SRC}"
+      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

-    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
-
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@ -698,7 +496,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
-    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")

  define_gpu_extension_target(
@ -712,8 +509,79 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    WITH_SOABI)
 endif()

-# For CUDA we also build and ship some external projects.
-if (VLLM_GPU_LANG STREQUAL "CUDA")
-    include(cmake/external_projects/flashmla.cmake)
-    include(cmake/external_projects/vllm_flash_attn.cmake)
+# vllm-flash-attn currently only supported on CUDA
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+  return()
 endif ()
+
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc.). Since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per-file basis),
+# we need to manually set VLLM_GPU_ARCHES here.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  foreach(_ARCH ${CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
+  endforeach()
+endif()
+
+#
+# Build vLLM flash attention from source
+#
+# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
+# They should be identical but if they aren't, this is a massive footgun.
+#
+# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# If no component is specified, vllm-flash-attn is still installed.
+
+# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
+# This is to enable local development of vllm-flash-attn within vLLM.
+# It can be set as an environment variable or passed as a cmake argument.
+# The environment variable takes precedence.
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
+endif()
+
+if(VLLM_FLASH_ATTN_SRC_DIR)
+  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+else()
+  FetchContent_Declare(
+          vllm-flash-attn
+          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+          GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
+          GIT_PROGRESS TRUE
+          # Don't share the vllm-flash-attn build between build types
+          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
+  )
+endif()
+
+# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+set(VLLM_PARENT_BUILD ON)
+
+# Ensure the vllm/vllm_flash_attn directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
+
+# Make sure vllm-flash-attn install rules are nested under vllm/
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+
+# Fetch the vllm-flash-attn library
+FetchContent_MakeAvailable(vllm-flash-attn)
+message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+
+# Copy over the vllm-flash-attn python files
+install(
+        DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+        DESTINATION vllm/vllm_flash_attn
+        COMPONENT vllm_flash_attn_c
+        FILES_MATCHING PATTERN "*.py"
+)
+
+# Nothing after vllm-flash-attn, see comment about macros above
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -61,7 +61,7 @@ representative at an online or offline/IRL event.

 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Slack](https://slack.vllm.ai).
+channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
 All complaints will be reviewed and investigated promptly and fairly.

 All community leaders are obligated to respect the privacy and security of the
@ -125,3 +125,4 @@ Community Impact Guidelines were inspired by
 For answers to common questions about this code of conduct, see the
 [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
 [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
+
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -27,13 +27,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500

 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
@ -57,16 +50,15 @@ WORKDIR /workspace
 # we need to install torch and torchvision from the nightly builds first,
 # pytorch will not appear as a vLLM dependency in all of the following steps
 # after this step
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
    fi

-COPY requirements/common.txt requirements/common.txt
-COPY requirements/cuda.txt requirements/cuda.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-cuda.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@ -84,19 +76,15 @@ FROM base AS build
 ARG TARGETPLATFORM

 # install build dependencies
-COPY requirements/build.txt requirements/build.txt
+COPY requirements-build.txt requirements-build.txt

-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-build.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@ -110,7 +98,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
@ -130,19 +118,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git  \
    if [ "$USE_SCCACHE" != "1" ]; then \
-        # Clean any existing CMake artifacts
-        rm -rf .deps && \
-        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-# sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=400
+# Default max size of the wheel is 250MB
+ARG VLLM_MAX_SIZE_MB=250
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -155,24 +140,16 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
 #################### DEV IMAGE ####################
 FROM base as dev

-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-
-COPY requirements/lint.txt requirements/lint.txt
-COPY requirements/test.txt requirements/test.txt
-COPY requirements/dev.txt requirements/dev.txt
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt
+COPY requirements-lint.txt requirements-lint.txt
+COPY requirements-test.txt requirements-test.txt
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-dev.txt
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
@ -196,13 +173,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -214,46 +184,22 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # we need to install torch and torchvision from the nightly builds first,
 # pytorch will not appear as a vLLM dependency in all of the following steps
 # after this step
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
    fi

 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install dist/*.whl --verbose

-# If we need to build FlashInfer wheel before its release:
-# $ export FLASHINFER_ENABLE_AOT=1
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
-# $ rm -rf build
-# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
-# $ ls dist
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
-
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
 fi
 COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
-
-# Although we build Flashinfer with AOT mode, there's still
-# some issues w.r.t. JIT compilation. Therefore we need to
-# install build dependencies for JIT compilation.
-# TODO: Remove this once FlashInfer AOT wheel is fixed
-COPY requirements/build.txt requirements/build.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt
-
 #################### vLLM installation IMAGE ####################

 #################### TEST IMAGE ####################
@ -263,24 +209,17 @@ FROM vllm-base AS test

 ADD . /vllm-workspace/

-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-dev.txt

 # install development dependencies (for testing)
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -e tests/vllm_test_utils
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

 # Copy in the v1 package for testing (it isn't distributed yet)
@ -297,18 +236,13 @@ RUN mv vllm test_docs/
 #################### OPENAI API SERVER ####################
 # base openai image with additional requirements, for any subsequent openai-style images
 FROM vllm-base AS vllm-openai-base
-ARG TARGETPLATFORM
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500

 # install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

 ENV VLLM_USAGE_SOURCE production-docker-image
--- a/docker/Dockerfile.arm
+++ b/docker/Dockerfile.arm
@ -26,18 +26,18 @@ WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
-    pip install -r requirements/build.txt
+    pip install -r requirements-build.txt

 FROM cpu-test-arm AS build

 WORKDIR /workspace/vllm

 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    pip install -v -r requirements/cpu.txt
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+    pip install -v -r requirements-cpu.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -0,0 +1,69 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on x86 CPU platforms.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-openmp==2025.0.1
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+RUN pip install intel_extension_for_pytorch==2.5.0
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
+    pip install --upgrade pip && \
+    pip install -r requirements-build.txt
+
+FROM cpu-test-1 AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+    pip install -v -r requirements-cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Support for building vLLM without AVX512: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -e tests/vllm_test_utils
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
@ -1,10 +1,10 @@
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

-RUN pip install -v -r requirements/hpu.txt
+RUN pip install -v -r requirements-hpu.txt

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

 FROM $BASE_IMAGE

@ -21,14 +21,12 @@ VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm

 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest

-# uninstall transformers-neuronx package explicitly to avoid version conflict
-RUN python3 -m pip uninstall -y transformers-neuronx
-
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
@ -36,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \

 RUN python3 -m pip install -U \
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        -r requirements/neuron.txt
+        -r requirements-neuron.txt

 ENV VLLM_TARGET_DEVICE neuron
 RUN --mount=type=bind,source=.git,target=.git \
@ -45,10 +43,6 @@ RUN --mount=type=bind,source=.git,target=.git \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils

-# install transformers-neuronx package as an optional dependencies (for V0)
-# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
-RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
-
 # overwrite entrypoint to run bash script
 RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py

--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -0,0 +1,29 @@
+# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
+# to run the OpenAI-compatible server.
+
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y \
+        git python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
+WORKDIR /workspace
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN python3 -m pip install -U pip
+# install build requirements
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
+# build vLLM with OpenVINO backend
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
+
+COPY examples/ /workspace/examples
+COPY benchmarks/ /workspace/benchmarks
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -0,0 +1,38 @@
+FROM mambaorg/micromamba
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+USER root
+
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
+
+# Some packages in requirements-cpu are installed here
+# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
+# Currently these may not be available for venv or pip directly
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
+RUN --mount=type=cache,target=/root/.cache/pip  \
+    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        torch==2.3.1 \
+        -r requirements-cpu.txt \
+        xformers uvloop==0.20.0
+
+RUN --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -0,0 +1,174 @@
+# Default ROCm 6.2 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
+
+# Default ROCm ARCHes to build vLLM for.
+ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+
+# Whether to install CK-based flash-attention
+# If 0, will not install flash-attention
+ARG BUILD_FA="1"
+ARG FA_GFX_ARCHS="gfx90a;gfx942"
+ARG FA_BRANCH="3cea2fb"
+
+# Whether to build triton on rocm
+ARG BUILD_TRITON="1"
+ARG TRITON_BRANCH="e192dba"
+
+### Base image build stage
+FROM $BASE_IMAGE AS base
+
+# Import arg(s) defined before this build stage
+ARG PYTORCH_ROCM_ARCH
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update && apt-get install -y \
+    curl \
+    ca-certificates \
+    sudo \
+    git \
+    bzip2 \
+    libx11-6 \
+    build-essential \
+    wget \
+    unzip \
+    tmux \
+    ccache \
+ && rm -rf /var/lib/apt/lists/*
+
+# When launching the container, mount the code directory to /vllm-workspace
+ARG APP_MOUNT=/vllm-workspace
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+# Remove sccache so it doesn't interfere with ccache
+# TODO: implement sccache support across components
+RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+
+# Install pre-release torch/torchvision from the ROCm 6.2 index (intended: torch 2.6.0; the version is not pinned)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.2"*) \
+            python3 -m pip uninstall -y torch torchvision \
+            && python3 -m pip install --pre \
+                torch \
+                'setuptools-scm>=8' \
+                torchvision \
+                --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \
+        *) ;; esac
+
+ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
+ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
+ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
+
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ENV CCACHE_DIR=/root/.cache/ccache
+
+
+### AMD-SMI build stage
+FROM base AS build_amdsmi
+# Build amdsmi wheel always
+RUN cd /opt/rocm/share/amd_smi \
+    && python3 -m pip wheel . --wheel-dir=/install
+
+
+### Flash-Attention wheel build stage
+FROM base AS build_fa
+ARG BUILD_FA
+ARG FA_GFX_ARCHS
+ARG FA_BRANCH
+# Build ROCm flash-attention wheel if `BUILD_FA = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_FA" = "1" ]; then \
+        mkdir -p libs \
+        && cd libs \
+        && git clone https://github.com/ROCm/flash-attention.git \
+        && cd flash-attention \
+        && git checkout "${FA_BRANCH}" \
+        && git submodule update --init \
+        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
+    fi
+
+
+### Triton wheel build stage
+FROM base AS build_triton
+ARG BUILD_TRITON
+ARG TRITON_BRANCH
+# Build triton wheel if `BUILD_TRITON = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_TRITON" = "1" ]; then \
+    mkdir -p libs \
+    && cd libs \
+    && python3 -m pip install ninja cmake wheel pybind11 \
+    && git clone https://github.com/OpenAI/triton.git \
+    && cd triton \
+    && git checkout "${TRITON_BRANCH}" \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
+    fi
+
+
+### Final vLLM build stage
+FROM base AS final
+# Import the vLLM development directory from the build context
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN python3 -m pip install --upgrade pip
+
+# Package upgrades for useful functionality or to avoid dependency issues
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
+
+
+# Workaround for ray >= 2.10.0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+# Silences the HF Tokenizers warning
+ENV TOKENIZERS_PARALLELISM=false
+
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    --mount=type=bind,source=.git,target=.git \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -Ur requirements-rocm.txt \
+    && python3 setup.py clean --all \
+    && python3 setup.py develop
+
+# Copy amdsmi wheel into final image
+RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
+    mkdir -p libs \
+    && cp /install/*.whl libs \
+    # Preemptively uninstall to avoid same-version no-installs
+    && python3 -m pip uninstall -y amdsmi;
+
+# Copy triton wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && python3 -m pip uninstall -y triton; fi
+
+# Copy flash-attn wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && python3 -m pip uninstall -y flash-attn; fi
+
+# Install wheels that were built to the final image
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if ls libs/*.whl; then \
+    python3 -m pip install libs/*.whl; fi
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+CMD ["/bin/bash"]
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20250124"
+ARG NIGHTLY_DATE="20241017"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE
@ -15,14 +15,11 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

-# Remove existing versions of dependencies
-RUN pip uninstall -y torch torch_xla torchvision
-
 ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 -m pip install \
-        -r requirements/tpu.txt
+        -r requirements-tpu.txt
 RUN python3 setup.py develop

 # install development dependencies (for testing)
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -0,0 +1,69 @@
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    curl \
+    ffmpeg \
+    git \
+    libsndfile1 \
+    libsm6 \
+    libxext6 \
+    libgl1 \
+    lsb-release \
+    numactl \
+    python3 \
+    python3-dev \
+    python3-pip \
+    # vim \
+    wget
+
+WORKDIR /workspace/vllm
+COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
+COPY requirements-common.txt /workspace/vllm/requirements-common.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-cache-dir \
+    -r requirements-xpu.txt
+
+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+# Repo sanity check is opt-in: default GIT_REPO_CHECK to 0 (as the other Dockerfiles do) so an unset arg skips it
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
+ENV VLLM_TARGET_DEVICE=xpu
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    python3 setup.py install
+
+CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+# use key=value form: the legacy "ENV key value" form would fold the continuation line into a single value
+ENV VLLM_USAGE_SOURCE=production-docker-image \
+    TRITON_XPU_PROFILE=1
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +1,9 @@
 include LICENSE
-include requirements/common.txt
-include requirements/cuda.txt
-include requirements/rocm.txt
-include requirements/neuron.txt
-include requirements/cpu.txt
+include requirements-common.txt
+include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
 include CMakeLists.txt

 recursive-include cmake *
--- a/README.md
+++ b/README.md
@ -10,24 +10,17 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

 ---

+The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
+
+---
+
 *Latest News* 🔥
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
-
-<details>
-<summary>Previous News</summary>
-
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@ -41,14 +34,11 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

-</details>
-
 ---
 ## About
-
 vLLM is a fast and easy-to-use library for LLM inference and serving.

-Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.

 vLLM is fast with:

@ -91,14 +81,14 @@ pip install vllm
 ```

 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

 ## Sponsors

@ -121,7 +111,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
@ -140,7 +129,6 @@ We also have an official fundraising venue through [OpenCollective](https://open
 ## Citation

 If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
-
 ```bibtex
@inproceedings{kwon2023efficient,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
@ -152,12 +140,12 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+* For technical questions and feature requests, please use GitHub issues or discussions.
+* For discussing with fellow users, please use Discord.
+* For coordinating contributions and development, please use Slack.
+* For security disclosures, please use GitHub's security advisory feature.
+* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

 ## Media Kit

- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
+* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
--- a/RELEASE.md
+++ b/RELEASE.md
@ -1,54 +0,0 @@
-# Releasing vLLM
-
-vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
-
-## Release Versioning
-
-vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
-
-* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
-* _minor_ major features
-* _patch_ features and backwards-compatible bug fixes
-* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
-
-## Release Cadence
-
-Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
-Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
-
-| Release Date | Patch release versions | Post Release versions |
-| --- | --- | --- |
-| Jan 2025 | 0.7.0 | --- |
-| Feb 2025 | 0.7.1, 0.7.2, 0.7.3  | --- |
-| Mar 2025 | 0.7.4, 0.7.5 | --- |
-| Apr 2025 | 0.7.6, 0.7.7 | --- |
-| May 2025 | 0.7.8, 0.7.9 | --- |
-| Jun 2025 | 0.7.10, 0.7.11 | --- |
-| Jul 2025 | 0.7.12, 0.7.13 | --- |
-| Aug 2025 | 0.7.14, 0.7.15 | --- |
-| Sep 2025 | 0.7.16, 0.7.17 | --- |
-| Oct 2025 | 0.7.18, 0.7.19 | --- |
-| Nov 2025 | 0.7.20, 0.7.21 | --- |
-| Dec 2025 | 0.7.22, 0.7.23 | --- |
-
-## Release branch
-
-Each release is built from a dedicated release branch.
-
-* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
-* For post releases, previously cut release branch is reused
-* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
-* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
-* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
-
-## Release Cherry-Pick Criteria
-
-After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
-
-* Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release)
-* Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks
-* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release)
-* Documentation improvements
-* Release branch specific changes (e.g. change version identifiers or CI fixes)
-
-Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -1,343 +1,19 @@
 # Benchmarking vLLM

-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
-
-## Dataset Overview
-
-<table style="width:100%; border-collapse: collapse;">
-  <thead>
-    <tr>
-      <th style="width:15%; text-align: left;">Dataset</th>
-      <th style="width:10%; text-align: center;">Online</th>
-      <th style="width:10%; text-align: center;">Offline</th>
-      <th style="width:65%; text-align: left;">Data Path</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td><strong>ShareGPT</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
-    </tr>
-    <tr>
-      <td><strong>BurstGPT</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
-    </tr>
-    <tr>
-      <td><strong>Sonnet</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
-    </tr>
-    <tr>
-      <td><strong>Random</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>synthetic</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-VisionArena</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmarena-ai/VisionArena-Chat</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-InstructCoder</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>likaixin/InstructCoder</code></td>
-    </tr>
-      <tr>
-      <td><strong>HuggingFace-AIMO</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-Other</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
-    </tr>
-  </tbody>
-</table>
-
-✅: supported
-
-🟡: Partial support
-
-🚧: to be supported
-
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
-
---
-## Example - Online Benchmark
-
-First start serving your model
+## Downloading the ShareGPT dataset

+You can download the dataset by running:
 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```

-Then run the benchmarking script
+## Downloading the ShareGPT4V dataset

+The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
+will ignore a datapoint if the referred image is missing.
 ```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10
+wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+mkdir coco -p
+wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
+unzip coco/train2017.zip -d coco/
 ```
-
-If successful, you will see the following output
-
-```
-============ Serving Benchmark Result ============
-Successful requests:                     10        
-Benchmark duration (s):                  5.78      
-Total input tokens:                      1369      
-Total generated tokens:                  2212      
-Request throughput (req/s):              1.73      
-Output token throughput (tok/s):         382.89    
-Total Token throughput (tok/s):          619.85    
---------------Time to First Token----------------
-Mean TTFT (ms):                          71.54     
-Median TTFT (ms):                        73.88     
-P99 TTFT (ms):                           79.49     
-----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms):                          7.91      
-Median TPOT (ms):                        7.96      
-P99 TPOT (ms):                           8.03      
---------------Inter-token Latency----------------
-Mean ITL (ms):                           7.74      
-Median ITL (ms):                         7.70      
-P99 ITL (ms):                            8.39      
-==================================================
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
-```
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --hf-split train \
-  --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-model "[ngram]" \
-    --ngram_prompt_lookup_min 2 \
-    --ngram-prompt-lookup-max 5 \
-    --num_speculative_tokens 5
-```
-
-``` bash
-python3 benchmarks/benchmark_serving.py \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name hf \
-    --dataset-path likaixin/InstructCoder \
-    --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
-```
-
-**`lmms-lab/LLaVA-OneVision-Data`**
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-**`AI-MO/aimo-validation-aime`**
-
-``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path AI-MO/aimo-validation-aime \
-    --num-prompts 10 \
-    --seed 42
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --top-k 10 \
-  --top-p 0.9 \
-  --temperature 0.5 \
-  --num-prompts 10
-```
-
---
-## Example - Offline Throughput Benchmark
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset-name sonnet \
-  --dataset-path vllm/benchmarks/sonnet.txt \
-  --num-prompts 10
-```
-
-If successful, you will see output similar to the following:
-
-```
-Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
-Total num prompt tokens:  5014
-Total num output tokens:  1500
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --num-prompts 1000 \
-  --hf-split train
-```
-
-The `Total num prompt tokens` value now includes image token counts:
-
-```
-Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
-Total num prompt tokens:  14527
-Total num output tokens:  1280
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
-    --dataset-name=hf \
-    --dataset-path=likaixin/InstructCoder \
-    --model=meta-llama/Meta-Llama-3-8B-Instruct \
-    --input-len=1000 \
-    --output-len=100 \
-    --num-prompts=2048 \
-    --async-engine \
-    --speculative-model="[ngram]" \
-    --ngram_prompt_lookup_min=2 \
-    --ngram-prompt-lookup-max=5 \
-    --num_speculative_tokens=5
-```
-
-```
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens:  261136
-Total num output tokens:  204800
-```
-
-### Other HuggingFaceDataset Examples
-
-**`lmms-lab/LLaVA-OneVision-Data`**
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-**`AI-MO/aimo-validation-aime`**
-
-```bash
-python3 benchmarks/benchmark_throughput.py \
-  --model Qwen/QwQ-32B \
-  --backend vllm \
-  --dataset-name hf \
-  --dataset-path AI-MO/aimo-validation-aime \
-  --hf-split train \
-  --num-prompts 10
-```
-
-### Benchmark with LoRA Adapters
-
-``` bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model meta-llama/Llama-2-7b-hf \
-  --backend vllm \
-  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --dataset_name sharegpt \
-  --num-prompts 10 \
-  --max-loras 2 \
-  --max-lora-rank 8 \
-  --enable-lora \
-  --lora-path yard1/llama-2-7b-sql-lora-test
-  ```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -1,13 +1,10 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import io
 import json
 import os
 import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
+from typing import List, Optional, Union

 import aiohttp
 import huggingface_hub.constants
@ -15,9 +12,6 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

-# NOTE(simon): do not import vLLM here so the benchmark script
-# can run without vLLM installed.
-
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@ -29,11 +23,11 @@ class RequestFuncInput:
    output_len: int
    model: str
    model_name: Optional[str] = None
+    best_of: int = 1
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
-    language: Optional[str] = None


@dataclass
@ -41,10 +35,9 @@ class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
-    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
-    itl: list[float] = field(
-        default_factory=list)  # list of inter-token latencies
+    itl: List[float] = field(
+        default_factory=list)  # List of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""
@ -57,15 +50,15 @@ async def async_request_tgi(
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        params = {
+            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
-            "ignore_eos_token": request_func_input.ignore_eos,
+            # TGI does not accept ignore_eos flag.
        }
        payload = {
            "inputs": request_func_input.prompt,
@ -73,10 +66,6 @@ async def async_request_tgi(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
-        if request_func_input.ignore_eos:
-            output.output_tokens = request_func_input.output_len
-        else:
-            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
@ -133,8 +122,8 @@ async def async_request_trt_llm(
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
@ -167,7 +156,7 @@ async def async_request_trt_llm(
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
-                            ttft = timestamp - st
+                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
@ -197,8 +186,8 @@ async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert request_func_input.best_of == 1

        payload = {
            "prompt": request_func_input.prompt,
@ -221,15 +210,7 @@ async def async_request_deepspeed_mii(
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    if "choices" in parsed_resp:
-                        output.generated_text = parsed_resp["choices"][0][
-                            "text"]
-                    elif "text" in parsed_resp:
-                        output.generated_text = parsed_resp["text"][0]
-                    else:
-                        output.error = ("Unexpected response format: "
-                                        "neither 'choices' nor 'text' found")
-                        output.success = False
+                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
@ -253,22 +234,18 @@ async def async_request_openai_completions(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
+            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
+            "ignore_eos": request_func_input.ignore_eos,
        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
@ -279,6 +256,7 @@ async def async_request_openai_completions(
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
@ -293,16 +271,15 @@ async def async_request_openai_completions(

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
-                        if chunk != "[DONE]":
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
-                            if choices := data.get("choices"):
-                                # Note that text could be empty here
-                                # e.g. for special tokens
-                                text = choices[0].get("text")
+                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
@ -316,10 +293,7 @@ async def async_request_openai_completions(
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
-                                generated_text += text or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
+                                generated_text += data["choices"][0]["text"]
                    if first_chunk_received:
                        output.success = True
                    else:
@ -328,7 +302,7 @@ async def async_request_openai_completions(
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
-                    output.latency = most_recent_timestamp - st
+                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
@ -348,11 +322,10 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        ("chat/completions", "profile")
+        "chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
@ -368,12 +341,8 @@ async def async_request_openai_chat_completions(
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
+            "ignore_eos": request_func_input.ignore_eos,
        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
@ -399,15 +368,17 @@ async def async_request_openai_chat_completions(

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
-                        if chunk != "[DONE]":
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

-                            if choices := data.get("choices"):
-                                content = choices[0]["delta"].get("content")
+                            delta = data["choices"][0]["delta"]
+                            if delta.get("content", None):
                                # First token
                                if ttft == 0.0:
-                                    ttft = timestamp - st
+                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
@ -415,16 +386,13 @@ async def async_request_openai_chat_completions(
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

-                                generated_text += content or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
+                                generated_text += delta["content"]

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
-                    output.latency = most_recent_timestamp - st
+                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
@ -438,125 +406,16 @@ async def async_request_openai_chat_completions(
    return output


-async def async_request_openai_audio(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    # Lazy import without PlaceholderModule to avoid vllm dep.
-    import soundfile
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        ("transcriptions", "translations"
-         )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
-    "or `translations`."
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        payload = {
-            "model": request_func_input.model_name \
-                if request_func_input.model_name else request_func_input.model,
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "language": "en",
-            # Flattened due to multipart/form-data
-            "stream_include_usage": True,
-            "stream_continuous_usage_stats": True
-        }
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        # Send audio file
-        def to_bytes(y, sr):
-            buffer = io.BytesIO()
-            soundfile.write(buffer, y, sr, format="WAV")
-            buffer.seek(0)
-            return buffer
-
-        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
-            form = aiohttp.FormData()
-            form.add_field('file', f, content_type='audio/wav')
-            for key, value in payload.items():
-                form.add_field(key, str(value))
-
-            output = RequestFuncOutput()
-            output.prompt_len = request_func_input.prompt_len
-
-            generated_text = ""
-            ttft = 0.0
-            st = time.perf_counter()
-            most_recent_timestamp = st
-            try:
-                async with session.post(url=api_url,
-                                        data=form,
-                                        headers=headers) as response:
-                    if response.status == 200:
-                        async for chunk_bytes in response.content:
-                            chunk_bytes = chunk_bytes.strip()
-                            if not chunk_bytes:
-                                continue
-
-                            chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                "data: ")
-                            if chunk != "[DONE]":
-                                timestamp = time.perf_counter()
-                                data = json.loads(chunk)
-
-                                if choices := data.get("choices"):
-                                    content = choices[0]["delta"].get(
-                                        "content")
-                                    # First token
-                                    if ttft == 0.0:
-                                        ttft = timestamp - st
-                                        output.ttft = ttft
-
-                                    # Decoding phase
-                                    else:
-                                        output.itl.append(
-                                            timestamp - most_recent_timestamp)
-
-                                    generated_text += content or ""
-                                elif usage := data.get("usage"):
-                                    output.output_tokens = usage.get(
-                                        "completion_tokens")
-
-                                most_recent_timestamp = timestamp
-
-                        output.generated_text = generated_text
-                        output.success = True
-                        output.latency = most_recent_timestamp - st
-                    else:
-                        output.error = response.reason or ""
-                        output.success = False
-            except Exception:
-                output.success = False
-                exc_info = sys.exc_info()
-                output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

-        from vllm.model_executor.model_loader.weight_utils import get_lock
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

-        # Use file lock to prevent multiple processes from
-        # downloading the same model weights at the same time.
-        with get_lock(pretrained_model_name_or_path):
-            model_path = snapshot_download(
-                model_id=pretrained_model_name_or_path,
-                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-
-            return model_path
+        return model_path
    return pretrained_model_name_or_path


@ -599,14 +458,7 @@ ASYNC_REQUEST_FUNCS = {
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
-    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions,
-             async_request_openai_chat_completions)
-]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Roger Wang	a7ca0cc47f	Merge branch 'main' into moondream2	2025-01-20 08:10:52 +00:00
Roger Wang	d789ce06a7	moondream text model Signed-off-by: Roger Wang <ywang@roblox.com>	2025-01-10 06:12:27 +00:00