uint64

fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-30 16:54:10 -07:00 · 2025-10-30 16:40:09 -07:00 · 2025-10-30 16:38:16 -07:00 · 2025-10-30 16:30:06 -07:00 · 2025-10-30 22:19:50 +00:00 · 2025-10-30 13:11:29 -07:00
1414 changed files with 58561 additions and 60722 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


 def print_top_10_largest_files(zip_file):
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@ -0,0 +1,10 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+  - name: "mmlu_pro"
+    metrics:
+      - name: "exact_match,custom-extract"
+        value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
--- a/.buildkite/lm-eval-harness/configs/models-mm-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m ) 
+        MODEL="$OPTARG"
+        ;;
+    l ) 
+        LIMIT="$OPTARG"
+        ;;
+    t ) 
+        TP_SIZE="$OPTARG"
+        ;;
+    \? ) 
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -19,21 +19,35 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
    )
    results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
+        batch_size=batch_size,
    )
    return results

--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -1,184 +0,0 @@
-steps:
-  - label: "Wait for container to be ready"
-    key: wait-for-container-image
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: badouralix/curl-jq
-            command:
-            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
-  
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@ -1,28 +0,0 @@
-# Nightly benchmark annotation
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-## Results reproduction
-
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
-    - Download `nightly-benchmarks.zip`.
-    - In the same folder, run the following code:
-
-    ```bash
-    export HF_TOKEN=<your HF token>
-    apt update
-    apt install -y git
-    unzip nightly-benchmarks.zip
-    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-    ```
-
-And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -1,39 +0,0 @@
-
-# Nightly benchmark
-
-This benchmark aims to:
-
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
-
-## Setup
-
- Docker images:
-    - vLLM: `vllm/vllm-openai:v0.6.2`
-    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
-    - 8x Nvidia A100 GPUs
- Workload:
-    - Dataset
-        - ShareGPT dataset
-        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-    - Models: llama-3 8B, llama-3 70B.
-        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-## Known issues
-
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@ -1,196 +0,0 @@
-common_pod_spec: &common_pod_spec
-  priorityClassName: perf-benchmark
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-    - name: devshm
-      emptyDir:
-        medium: Memory
-    - name: hf-cache
-      hostPath:
-        path: /root/.cache/huggingface
-        type: Directory
-
-common_container_settings: &common_container_settings
-  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  resources:
-    limits:
-      nvidia.com/gpu: 8
-  volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-  env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-
-steps:
-  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-
-
-  - label: "A100 vllm step 10"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: vllm/vllm-openai:v0.6.2
-                <<: *common_container_settings
-
-
-
-  - label: "A100 sglang benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: lmsysorg/sglang:v0.3.2-cu121
-                <<: *common_container_settings
-
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: openmmlab/lmdeploy:v0.6.1-cu12
-                <<: *common_container_settings
-
-
-
-
-  - label: "A100 trt llama-8B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama8B"
-
-
-  - label: "A100 trt llama-70B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama70B"
-
-
-  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-  #               <<: *common_container_settings
-
-
-  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
-  #               <<: *common_container_settings
-        
-  - wait
-
-  - label: "Collect the results"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-            - image: vllm/vllm-openai:v0.5.0.post1
-              command:
-              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-              resources:
-                limits:
-                  nvidia.com/gpu: 8
-              volumeMounts:
-              - name: devshm
-                mountPath: /dev/shm
-              env:
-              - name: VLLM_USAGE_SOURCE
-                value: ci-test
-              - name: VLLM_SOURCE_CODE_LOC
-                value: /workspace/build/buildkite/vllm/performance-benchmark
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: token
-
-  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-
-from transformers import AutoTokenizer
-
-
-def main(model, cachedir):
-    # Load the tokenizer and save it to the specified directory
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    tokenizer.save_pretrained(cachedir)
-    print(f"Tokenizer saved to {cachedir}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
-    )
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument(
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    )
-
-    args = parser.parse_args()
-    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
-    )
-    parser.add_argument(
-        "--results-folder",
-        type=str,
-        required=True,
-        help="The folder where the results are stored.",
-    )
-    parser.add_argument(
-        "--description", type=str, required=True, help="Description of the results."
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-def get_perf(df, method, model, metric):
-    means = []
-
-    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
-        filtered_df = df[target]
-
-        if filtered_df.empty:
-            means.append(0.0)
-        else:
-            means.append(filtered_df[metric].values[0])
-
-    return np.array(means)
-
-
-def get_perf_w_std(df, method, model, metric):
-    if metric in ["TTFT", "ITL"]:
-        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
-        mean = mean.tolist()
-        std = get_perf(df, method, model, "Std " + metric + " (ms)")
-        if std.mean() == 0:
-            std = None
-        success = get_perf(df, method, model, "Successful req.")
-        if std is not None:
-            std = std / np.sqrt(success)
-            std = std.tolist()
-
-    else:
-        assert metric == "Tput"
-        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
-        )
-        mean = mean.tolist()
-        std = None
-
-    return mean, std
-
-
-def main(args):
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file) as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
-
-    with open(args.description) as f:
-        description = f.read()
-
-    description = description.format(nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@ -1,9 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from lmdeploy.serve.openai.api_client import APIClient
-
-api_client = APIClient("http://localhost:8000")
-model_name = api_client.available_models[0]
-
-print(model_name)
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@ -1,78 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-
-main() {
-
-    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-    (which jq) || (apt-get update && apt-get -y install jq)
-    (which zip) || (apt-get install -y zip)
-
-    if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip plotting the results."
-        exit 0
-    fi
-
-    # initial annotation
-    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
-
-    # download results
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    mkdir -p results/
-    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
-    ls
-    ls results/
-
-    # upload benchmark results
-    zip -r results.zip results/
-    /workspace/buildkite-agent artifact upload "results.zip"
-
-    # upload benchmarking scripts
-    cd "$VLLM_SOURCE_CODE_LOC/"
-    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
-    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    # upload benchmarking pipeline
-    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
-    
-
-
-    # The figures should be generated by a separate process outside the CI/CD pipeline
-
-    # # generate figures
-    # python3 -m pip install tabulate pandas matplotlib
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
-    #     --description $description \
-    #     --results-folder results/ 
-
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sharegpt
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_2048_128
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_128_2048
-    
-    # # upload results and figures
-    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -1,464 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-get_current_llm_serving_engine() {
-
-  if which lmdeploy >/dev/null; then
-    echo "Container: lmdeploy"
-    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
-    return
-  fi
-
-  if [ -e /tgi-entrypoint.sh ]; then
-    echo "Container: tgi"
-    export CURRENT_LLM_SERVING_ENGINE=tgi
-    return
-  fi
-
-  if which trtllm-build >/dev/null; then
-    echo "Container: tensorrt-llm"
-    export CURRENT_LLM_SERVING_ENGINE=trt
-    return
-  fi
-
-  if [ -e /sgl-workspace ]; then
-    echo "Container: sglang"
-    export CURRENT_LLM_SERVING_ENGINE=sglang
-    return
-  fi
-
-  if [ -e /vllm-workspace ]; then
-    echo "Container: vllm"
-    # move to a completely irrelevant directory, to avoid import vllm from current folder
-    export CURRENT_LLM_SERVING_ENGINE=vllm
-
-    return
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-kill_gpu_processes() {
-  pkill -f '[p]ython'
-  pkill -f '[p]ython3'
-  pkill -f '[t]ritonserver'
-  pkill -f '[p]t_main_thread'
-  pkill -f '[t]ext-generation'
-  pkill -f '[l]mdeploy'
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pkill -f '[V]LLM'
-
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-ensure_installed() {
-  # Ensure that the given command is installed by apt-get
-  local cmd=$1
-  if ! which "$cmd" >/dev/null; then
-    apt-get update && apt-get install -y "$cmd"
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # prepare tokenizer
-    # this is required for lmdeploy.
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-
-
-    # change model name for lmdeploy (it will not follow standard hf name)
-    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
-      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ $backend = "trt" ]]; then
-        backend="tensorrt-llm"
-      fi
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-
-      if [[ "$dataset_name" = "sharegpt" ]]; then
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      elif [[ "$dataset_name" = "sonnet" ]]; then
-
-        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
-        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
-        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --sonnet-input-len $sonnet_input_len \
-          --sonnet-output-len $sonnet_output_len \
-          --sonnet-prefix-len $sonnet_prefix_len \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      else
-
-        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
-        exit 1
-
-      fi
-
-
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      server_command="None"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-  done
-
-  kill_gpu_processes
-}
-
-run_genai_perf_tests() {
-  # run genai-perf tests
-
-  # $1: a json file specifying genai-perf test cases
-  local genai_perf_test_file
-  genai_perf_test_file=$1
-
-  # Iterate over genai-perf tests
-  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps=$num_prompts
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-      #TODO: add output dir.
-      client_command="genai-perf profile \
-        -m $model \
-        --service-kind openai \
-        --backend "$backend" \
-        --endpoint-type chat \
-        --streaming \
-        --url localhost:$port \
-        --request-rate $qps \
-        --num-prompts $num_prompts \
-      "
-
-    echo "Client command: $client_command"
-
-    eval "$client_command"
-
-    #TODO: process/record outputs
-    done
-  done
-
-  kill_gpu_processes
-
-}
-
-prepare_dataset() {
-
-  # download sharegpt dataset
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-
-}
-
-main() {
-
-  # check if the environment variable is successfully injected from yaml
-
-  check_gpus
-  check_hf_token
-  get_current_llm_serving_engine
-
-  pip install -U transformers
-
-  pip install -r requirements/dev.txt
-  which genai-perf
-
-  # check storage
-  df -h
-
-  ensure_installed wget
-  ensure_installed curl
-  ensure_installed jq
-  # genai-perf dependency
-  ensure_installed libb64-0d
-
-  prepare_dataset
-
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-
-  # run the test
-  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
-
-  # run genai-perf tests
-  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
-  mv artifacts/ $RESULTS_FOLDER/
-
-  # upload benchmark results to buildkite
-  python3 -m pip install tabulate pandas
-  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import datetime
-import json
-import os
-from pathlib import Path
-
-import pandas as pd
-from tabulate import tabulate
-
-results_folder = Path("results/")
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "completed": "Successful req.",
-    "request_throughput": "Tput (req/s)",
-    "mean_ttft_ms": "Mean TTFT (ms)",
-    "std_ttft_ms": "Std TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "mean_itl_ms": "Mean ITL (ms)",
-    "std_itl_ms": "Std ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "std_tpot_ms": "Std TPOT (ms)",
-    "median_tpot_ms": "Median TPOT (ms)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
-    "total_input_tokens": "Total input tokens",
-    "total_output_tokens": "Total output tokens",
-    "engine": "Engine",
-}
-
-if __name__ == "__main__":
-    # collect results
-    for test_file in results_folder.glob("*.json"):
-        with open(test_file) as f:
-            raw_result = json.loads(f.read())
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands")) as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    serving_results = pd.DataFrame.from_dict(serving_results)
-
-    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
-
-    serving_md_table_with_headers = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
-
-    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
-
-    # document benchmarking results in markdown
-    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        # document results with header.
-        # for those who wants to reproduce our benchmark.
-        f.write(serving_md_table_with_headers)
-        f.write("\n")
-
-    # document benchmarking results in json
-    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
-        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -1,23 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
-
-TIMEOUT_SECONDS=10
-
-retries=0
-while [ $retries -lt 1000 ]; do
-    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
-        exit 0
-    fi
-
-    echo "Waiting for image to be available..."
-
-    retries=$((retries + 1))
-    sleep 5
-done
-
-exit 1
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@ -1,30 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama8B_tp4",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@ -1,32 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_llama8B_tp4",
-        "environment_variables": {
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@ -2,40 +2,23 @@

 ## Introduction

-This directory contains two sets of benchmark for vllm.
-
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
+vLLM also maintains a continuous performance benchmark at [perf.vllm.ai](https://perf.vllm.ai/), hosted on the PyTorch CI HUD.

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.

 **Benchmarking Duration**: about 1hr.

 **For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.

-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark

-Performance benchmark will be triggered when:
-
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
+The benchmark needs to be triggered manually:

 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 Runtime environment variables:
@ -47,10 +30,6 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

-Nightly benchmark will be triggered when:
-
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
-
 ## Performance benchmark details

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@ -7,6 +7,7 @@ from importlib import util

 import pandas as pd

+pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None


@ -109,7 +110,10 @@ def compare_data_columns(
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
-            ratio = current / base
+            if "P99" in data_column or "Median" in data_column:
+                ratio = base / current  # for latency
+            else:
+                ratio = current / base
            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)
@ -199,6 +203,71 @@ def split_json_by_tp_pp(
    return saved_paths


+def _add_limit_line(fig, y_value, label):  # Mark the reference limit `y_value` on `fig` (presumably a Plotly figure), tagged with `label`.
+    # Dashed horizontal line at y_value, annotated "<label>: <y_value> ms" at the top left
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",  # red when the label mentions TTFT, blue otherwise
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    # Optional: add a legend item for the limit via a helper trace that carries no data points
+    if plot and plotly_found:  # NOTE(review): `plot` is not defined in this function; presumably a module-level flag set by the script entry point - confirm
+        import plotly.graph_objects as go  # imported only after plotly_found has been checked
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],  # x and y are [None]: the trace exists only for its legend entry
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash", color="red" if "ttft" in label.lower() else "blue"  # same dash/color rule as the hline above
+                ),
+                name=f"{label}",
+            )
+        )
+
+
+def _find_concurrency_col(df: pd.DataFrame) -> str:  # Return the name of the column in `df` that holds the max-concurrency setting.
+    for c in [  # known spellings of the column name, tried in this order
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c  # the first known name present in df wins
+    # Fallback: guess the first integer-typed column with more than one distinct value and a minimum >= 1 (harmless if unused).
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> "pd.io.formats.style.Styler":
+    """Highlight numeric per-configuration columns with value <= threshold."""
+    conc_col = _find_concurrency_col(df)  # the concurrency column is treated as a key column, not as a result
+    key_cols = [  # identifying columns present in df; these are never highlighted
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [  # every remaining column whose name does not start with "Ratio"
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]  # keep numeric columns only
+    return df.style.map(  # per-cell CSS: light-green bold when the value is not NA and <= threshold, no style otherwise
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,  # styling is restricted to the selected result columns
+    )
+
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
@ -220,6 +289,26 @@ if __name__ == "__main__":
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="take median|p99 for latency like TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
+
    args = parser.parse_args()

    drop_column = "P99"
@ -234,12 +323,22 @@ if __name__ == "__main__":
        "# of max concurrency.",
        "qps",
    ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
+
+    if "median" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "Median TTFT /n",
+            "Median TPOT /n",
+        ]
+        drop_column = "P99"
+    elif "p99" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "P99 TTFT /n",
+            "P99 TPOT /n",
+        ]

    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
@ -275,33 +374,83 @@ if __name__ == "__main__":
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
-            output_df_sorted = output_df.sort_values(by=existing_group_cols)
+            # Sort rows by the x-axis column (was: sort_values(by=existing_group_cols))
+            output_df_sorted = output_df.sort_values(by=args.xaxis)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
-                html = group.to_html()
+                group_name = (
+                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
+                )
+                group_html_name = "perf_comparison_" + group_name + ".html"
+
+                metric_name = str(data_cols_to_compare[i]).lower()
+                if "tok/s" in metric_name:
+                    html = group.to_html()
+                elif "ttft" in metric_name:
+                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
+                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                        na_rep="—",
+                    )
+                    html = styler.to_html(
+                        table_attributes='border="1" class="dataframe"'
+                    )
+                elif (
+                    "tpot" in metric_name
+                    or "median" in metric_name
+                    or "p99" in metric_name
+                ):
+                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
+                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                        na_rep="—",
+                    )
+                    html = styler.to_html(
+                        table_attributes='border="1" class="dataframe"'
+                    )
+
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)
+                with open(group_html_name, "a+") as sub_text_file:
+                    sub_text_file.write(html_msgs_for_data_cols[i])
+                    sub_text_file.write(html)

-                if plot and plotly_found:
-                    import plotly.express as px
+                    if plot and plotly_found:
+                        import plotly.express as px

-                    df = group[raw_data_cols]
-                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                    # Melt DataFrame for plotting
-                    df_melted = df_sorted.melt(
-                        id_vars=info_cols[y_axis_index],
-                        var_name="Configuration",
-                        value_name=data_cols_to_compare[i],
-                    )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                    # Create Plotly line chart
-                    fig = px.line(
-                        df_melted,
-                        x=info_cols[y_axis_index],
-                        y=data_cols_to_compare[i],
-                        color="Configuration",
-                        title=title,
-                        markers=True,
-                    )
-                    # Export to HTML
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+                        df = group[raw_data_cols]
+                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                        # Melt DataFrame for plotting
+                        df_melted = df_sorted.melt(
+                            id_vars=info_cols[y_axis_index],
+                            var_name="Configuration",
+                            value_name=data_cols_to_compare[i],
+                        )
+                        title = (
+                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                        )
+                        # Create Plotly line chart
+                        fig = px.line(
+                            df_melted,
+                            x=info_cols[y_axis_index],
+                            y=data_cols_to_compare[i],
+                            color="Configuration",
+                            title=title,
+                            markers=True,
+                        )
+
+                        # ---- Add threshold lines based on metric name ----
+                        if "ttft" in metric_name:
+                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+                        elif (
+                            "tpot" in metric_name
+                            or "median" in metric_name
+                            or "p99" in metric_name
+                        ):
+                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+                        # Export to HTML
+                        text_file.write(
+                            fig.to_html(full_html=True, include_plotlyjs="cdn")
+                        )
+                        sub_text_file.write(
+                            fig.to_html(full_html=True, include_plotlyjs="cdn")
+                        )
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@ -63,9 +63,11 @@ serving_column_mapping = {
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@ -368,7 +370,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
        )

    # get markdown tables
@ -390,7 +392,7 @@ if __name__ == "__main__":
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
--- a/.buildkite/performance-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@ -469,7 +469,12 @@ main() {
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
+
+  # Save the output of `vllm collect-env` to the results folder
+  env_output=$(vllm collect-env)
+
+  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
--- a/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
--- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp2",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/latency-tests.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests.json
--- a/.buildkite/performance-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/performance-benchmarks/tests/nightly-tests.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
@ -95,6 +95,38 @@
            "num_prompts": 200
        }
    },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@ -233,6 +265,41 @@
            "num_prompts": 1000
        }
    },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
    {
        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
        "qps_list": ["inf"],
@ -365,6 +432,38 @@
            "num_prompts": 200
        }
    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@ -503,6 +602,41 @@
            "num_prompts": 1000
        }
    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
        "qps_list": ["inf"],
@ -638,6 +772,39 @@
            "num_prompts": 200
        }
    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
        "qps_list": ["inf"],
@ -780,6 +947,42 @@
            "num_prompts": 1000
        }
    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
        "qps_list": ["inf"],
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@ -2,7 +2,7 @@
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -28,13 +28,13 @@
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -60,13 +60,13 @@
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
        }
    },
    {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_tp1_random_128_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -76,39 +76,7 @@
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp4_random_1024_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
+            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@ -124,16 +92,16 @@
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
-	    "random-input-len": 1024,
+	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
        }
    },
    {
-        "test_name": "serving_llama8B_pp6_random_1024_128",
+        "test_name": "serving_llama8B_tp2_random_128_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -143,7 +111,7 @@
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 6,
+            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
@ -159,10 +127,150 @@
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
-	    "random-input-len": 1024,
+	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp1_random_128_2048",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 2048,
+	    "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_128_2048",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 2048,
+	    "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp1_random_2048_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 2048,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_2048_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 2048,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 32
        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp2",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests.json
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
  - label: "Build arm64 wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
@ -8,13 +8,28 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

+  # aarch64 CPU build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.8"
    depends_on: ~
    id: build-wheel-cuda-12-8
@ -28,20 +43,6 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build wheel - CUDA 12.6"
-    depends_on: ~
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
@ -55,6 +56,20 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (CUDA 12.9)
  - label: "Build release image (x86)"
    depends_on: ~
    id: build-release-image-x86
@ -62,13 +77,12 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
@ -76,7 +90,7 @@ steps:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
@ -142,6 +156,22 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -70,7 +70,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Note: disable it until supports V1
  # Run AWQ test
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -20,7 +20,10 @@ trap remove_docker_container EXIT

 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
+    --net=host \
+    --ipc=host \
+    --privileged \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
@ -42,7 +45,7 @@ docker run \
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@ -172,6 +172,8 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@ -203,6 +205,24 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@ -311,6 +331,15 @@ steps:
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
 - label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
    - vllm/
@ -349,7 +378,8 @@ steps:
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@ -384,7 +414,12 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss.py \
+      --ignore=lora/test_qwen3moe_tp.py
+
  parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min
@ -416,8 +451,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -425,6 +460,19 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@ -468,6 +516,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@ -527,8 +577,9 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@ -677,8 +728,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
@ -733,6 +786,16 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@ -796,8 +859,8 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@ -810,8 +873,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@ -828,15 +889,32 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-    # Fusion
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
-    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@ -943,6 +1021,8 @@ steps:
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
@ -950,6 +1030,7 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@ -993,6 +1074,11 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@ -1032,6 +1118,7 @@ steps:
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
@ -1058,6 +1145,17 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+

 ##### multi gpus test #####
 ##### A100 test #####
@ -1088,8 +1186,21 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
@ -1097,6 +1208,8 @@ steps:
  commands:
    - pytest -v -s tests/compile/test_async_tp.py
    - pytest -v -s tests/compile/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048

--- a/.coveragerc
+++ b/.coveragerc
@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
    */tests/*
    */test_*
@ -12,6 +17,16 @@ omit =
    */benchmarks/*
    */docs/*

+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
 [report]
 exclude_lines =
    pragma: no cover
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -5,10 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@ -26,9 +24,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
@ -47,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@ -60,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC

 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor

 # Docs
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -108,7 +108,7 @@ pull_request_rules:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
  actions:
    label:
      add:
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -13,6 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
+        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
@ -42,7 +43,6 @@ jobs:
                    searchIn: "body"
                  },
                ],
-                
                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  {
@ -89,14 +89,12 @@ jobs:
                    term: "hip_",
                    searchIn: "both"
                  },
-                  
                  // ROCm tools and libraries
                  {
                    term: "hipify",
                    searchIn: "both"
                  },
                ],
-                
                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
@ -107,13 +105,17 @@ jobs:
                  }
                ],
              },
+              // Add more label configurations here as needed
+              // example: {
+              //   keywords: [...],
+              //   substrings: [...],
+              //   regexPatterns: [...]
+              // },
            };
-            
            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-              
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
@ -125,16 +127,13 @@ jobs:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }
-            
            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');
-              
              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;
-                
                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
@ -146,21 +145,17 @@ jobs:
                  description = termConfig.description;
                  flags = termConfig.flags;
                }
-                
                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }
-                
                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }
-                
                const termMatches = [];
-                
                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
@ -175,15 +170,14 @@ jobs:
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
-                        context: line.length > 100 ? 
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
+                        context: line.length > 100 ?
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                          : line.trim()
                      });
                    });
                  }
                });
-                
                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
@ -196,64 +190,48 @@ jobs:
                  });
                }
              }
-              
              return matches;
            }
-            
            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";
-              
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);
-              
              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';
-              
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
-              
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-              
              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
-                
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-                
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }
-              
              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
-                
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-                
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }
-              
              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
-                
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-                  
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
-                  
                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -266,7 +244,6 @@ jobs:
                    }
                  });
                }
-                
                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -274,13 +251,10 @@ jobs:
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-                
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }
-              
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
-              
              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }
-              
              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }
-            
            // Process all configured labels
-            const processLabels = Object.entries(labelConfig)
-              .map(([labelName, config]) => processLabel(labelName, config));
-            const labelsAdded = await Promise.all(processLabels);
-            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            const labelsAddedResults = await Promise.all(
+              Object.entries(labelConfig).map(([labelName, config]) => 
+                processLabel(labelName, config).then(added => ({ labelName, added }))
+              )
+            );
+            
+            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            
+            // Return which labels were added for the next step
+            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+            core.setOutput('labels_added', JSON.stringify(addedLabels));
+            return addedLabels;
+
+      - name: CC users for labeled issues
+        if: steps.label-step.outputs.labels_added != '[]'
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
+        env:
+          # Hand the JSON over through the environment instead of interpolating it
+          # into the script source, so quotes/backslashes in a label name cannot
+          # break (or inject into) the script.
+          LABELS_ADDED: ${{ steps.label-step.outputs.labels_added }}
+        with:
+          script: |
+            // For every label added by the previous step, post one comment that
+            // @-mentions the configured users who are not already mentioned in the
+            // issue body or in any existing comment.
+            //
+            // Configuration: Map labels to GitHub users to CC
+            // You can add multiple users per label, and multiple label configurations
+            const ccConfig = {
+              rocm: {
+                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
+                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
+              },
+              // Add more label -> user mappings here
+              // Example:
+              // cuda: {
+              //   users: ['user1', 'user2'],
+              //   message: 'CC {users} for CUDA-related issue'
+              // },
+              // performance: {
+              //   users: ['perfexpert'],
+              //   message: 'CC {users} for performance issue'
+              // },
+            };
+
+            // An empty/missing output is treated as "no labels added" rather than
+            // letting JSON.parse throw on an empty string.
+            const labelsAdded = JSON.parse(process.env.LABELS_ADDED || '[]');
+            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+
+            // Get existing comments to check for already mentioned users.
+            // paginate() walks every page; a plain listComments call only returns
+            // the first page, which would miss mentions on long threads and lead
+            // to duplicate CC comments.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              per_page: 100,
+            });
+
+            const issueBody = context.payload.issue.body || '';
+            const allExistingText = issueBody + '\n' + comments.map(c => c.body).join('\n');
+
+            // Process each label that was added
+            for (const label of labelsAdded) {
+              if (ccConfig[label]) {
+                const config = ccConfig[label];
+                const usersToMention = [];
+
+                // Check which users haven't been mentioned yet
+                for (const user of config.users) {
+                  // A hyphen can continue a GitHub login, so "@user-bot" must not
+                  // count as a mention of "@user"; \b alone would accept it.
+                  const mentionPattern = new RegExp(`@${user}(?![\\w-])`, 'i');
+                  if (!mentionPattern.test(allExistingText)) {
+                    usersToMention.push(user);
+                  } else {
+                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
+                  }
+                }
+
+                // Post comment if there are users to mention
+                if (usersToMention.length > 0) {
+                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
+                  const message = config.message.replace('{users}', mentions);
+
+                  await github.rest.issues.createComment({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: context.issue.number,
+                    body: message
+                  });
+
+                  core.notice(`CC comment added for label "${label}": ${mentions}`);
+                } else {
+                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+                }
+              }
+            }
--- a/.gitignore
+++ b/.gitignore
@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**

+# uv
+uv.lock
+
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@ -4,7 +4,6 @@ MD013: false
 MD024:
  siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -16,6 +16,7 @@ repos:
  rev: v1.38.1
  hooks:
  - id: typos
+    args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v21.1.2
  hooks:
@ -37,18 +38,18 @@ repos:
  rev: 0.9.1
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
-    name: Run mypy for local Python installation
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    name: Run mypy locally for lowest supported Python version
+    entry: python tools/pre_commit/mypy.py 0 "3.10"
    stages: [pre-commit] # Don't run in CI
    <<: &mypy_common
      language: python
@ -77,12 +78,12 @@ repos:
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
@ -99,12 +100,12 @@ repos:
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
@ -118,11 +119,11 @@ repos:
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
@ -130,7 +131,7 @@ repos:
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
@ -143,7 +144,7 @@ repos:
    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
    language: python
    additional_dependencies: [regex]
  # Keep `suggestion` last
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")

 #
 # Try to find python package with an executable that exactly matches
@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/README.md
+++ b/README.md
@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

+- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -5,7 +5,7 @@ import gc
 from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool


--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@ -46,7 +46,7 @@ import time

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -19,7 +19,7 @@ from vllm.config import (
    VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -11,7 +11,7 @@ import time
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 # Select a equi-probable random priority
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -31,6 +31,7 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass

 import datasets
@ -50,7 +51,7 @@ except ImportError:
    from backend_request_func import get_tokenizer

 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

@ -501,15 +502,9 @@ async def benchmark(

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)

--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_triton_block_scaled_mm,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import cdiv

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 def with_triton_mode(fn):
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@ -28,7 +28,7 @@ except ImportError as e:

 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 WEIGHT_SHAPES_MOE = {
    "nvidia/DeepSeek-R1-FP4": [
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 )
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 logger = init_logger(__name__)

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -7,7 +7,8 @@ import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -25,7 +25,7 @@ if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    sort_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 )
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok

    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@ -39,7 +39,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -9,9 +9,9 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
    create_kv_caches_with_random,
 )

--- a/benchmarks/kernels/benchmark_polynorm.py
+++ b/benchmarks/kernels/benchmark_polynorm.py
@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@ -7,7 +7,8 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@ -9,9 +9,9 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
    create_kv_caches_with_random,
 )

--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
    create_kv_caches_with_random_flash,
 )

--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -8,7 +8,7 @@ import torch

 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def benchmark_rope_kernels_multi_lora(
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@ -8,7 +8,7 @@ from datetime import datetime
 import flashinfer
 import torch

-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@ -8,7 +8,7 @@ from datetime import datetime
 import flashinfer
 import torch

-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)

--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@ -11,7 +11,7 @@ import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@ -1251,7 +1251,7 @@ async def main() -> None:
        default=None,
        help="The model name used in the API. "
        "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
    )

    parser.add_argument(
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@ -5,7 +5,7 @@ import cProfile
 import pstats

 from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -188,16 +188,60 @@ else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()

-#
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
-    set(USE_ACL ON)
-else()
-    set(USE_ACL OFF)
-endif()

+# Build oneDNN for GEMM kernels (only for x86-AVX512, non-Apple AArch64 and POWER9/10/11 platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
+    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+    if(ASIMD_FOUND)
+        # Prefer a user-supplied ACL checkout (ACL_ROOT_DIR must name an existing directory);
+        # otherwise fetch the pinned ACL tag into the FetchContent base directory.
+        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
+            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
+        else()
+            message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
+            FetchContent_Populate(arm_compute
+                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
+                SOURCE_DIR   "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
+                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
+                GIT_TAG        v52.2.0
+                GIT_SHALLOW    TRUE
+                GIT_PROGRESS   TRUE
+            )
+            # Publish the fetched location through the same env var the user would have set,
+            # so the SCons working directory and the rpath below read it from one place.
+            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+        endif()
+
+        # Build ACL with scons
+        # NOTE(review): ProcessorCount sets its output to 0 when the core count cannot be
+        # determined, which would yield `scons -j0` -- confirm this is acceptable or add a fallback.
+        include(ProcessorCount)
+        ProcessorCount(_NPROC)
+        set(_scons_cmd
+        scons -j${_NPROC}
+            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+            multi_isa=1 openmp=1 cppthreads=0
+        )
+
+        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+        # and create a local shim dir with it
+        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
+        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+        # Only add the extra link search path when a shim directory was actually produced.
+        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
+            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
+        endif()
+
+        # Run SCons inside the ACL source tree; a non-zero exit code aborts configuration below.
+        execute_process(
+            COMMAND ${_scons_cmd}
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+            RESULT_VARIABLE _acl_rc
+        )
+        if(NOT _acl_rc EQUAL 0)
+            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+        endif()
+
+        # Turn on oneDNN's ACL option, add an rpath to the ACL build directory,
+        # and define VLLM_USE_ACL for compiled sources.
+        set(ONEDNN_AARCH64_USE_ACL "ON")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        add_compile_definitions(VLLM_USE_ACL)
+    endif()
+
    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")

    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@ -217,16 +261,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        )
    endif()

-    if(USE_ACL)
-        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-        if(NOT ARM_COMPUTE_LIBRARY)
-            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
-        endif()
-        set(ONEDNN_AARCH64_USE_ACL "ON")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-        add_compile_definitions(VLLM_USE_ACL)
-    endif()
-
    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
    set(ONEDNN_BUILD_EXAMPLES "OFF")
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@ -19,7 +19,7 @@ else()
  FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
        ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
    )

    set(FlashMLA_INCLUDES
--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@ -22,10 +22,10 @@ else()
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
-  FetchContent_Populate(qutlass)
-  set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
 endif()

+FetchContent_Populate(qutlass)
+
 if(NOT qutlass_SOURCE_DIR)
  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
 endif()
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
+          GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()

+# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
+#   libgomp.so    -> libgomp-<hash>.so...
+#   libgomp.so.1  -> libgomp-<hash>.so...
+# OUTPUT: TORCH_GOMP_SHIM_DIR  ("" if not found)
+function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
+  # Default the output to "" so that the early return below reports "not found".
+  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
+
+  # Use run_python to locate vendored libgomp; never throw on failure.
+  # The snippet prints the first `libgomp-*.so*` match inside the `torch.libs` directory that
+  # sits next to the torch package, or an empty line if anything raises (including no match).
+  # NOTE(review): glob.glob() does not sort its results, so with several matches the file
+  # chosen is whichever happens to be listed first.
+  run_python(_VLLM_TORCH_GOMP_PATH
+    "
+import os, glob
+try:
+  import torch
+  torch_pkg = os.path.dirname(torch.__file__)
+  site_root = os.path.dirname(torch_pkg)
+  torch_libs = os.path.join(site_root, 'torch.libs')
+  print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except:
+  print('')
+"
+    "failed to probe torch.libs for libgomp")
+
+  # Nothing usable was found: leave the output empty.
+  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
+    return()
+  endif()
+
+  # Create shim under the build tree
+  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
+  file(MAKE_DIRECTORY "${_shim}")
+
+  # Drop any links left from a previous configure, then point both names at the same file.
+  # NOTE(review): the results of these execute_process calls are not checked.
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
+
+  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
+endfunction()
+
 # Macro for converting a `gencode` version number to a cmake version number.
 macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
--- a/codecov.yml
+++ b/codecov.yml
@ -0,0 +1,12 @@
+codecov:
+  require_ci_to_pass: false
+
+fixes:
+  # Map source code paths to repository root paths
+  # Wildcards match any Python version (python3.*)
+  - "/vllm-workspace/src/vllm/::vllm/"
+  - "/vllm-workspace/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@ -125,32 +125,37 @@ public:
  }

   static void set_split_kv (KernelArguments& args) {
+    // Fills in args.split_kv only when the caller left it unset (< 1);
+    // an explicit value >= 1 is kept unchanged.
-    // printf("set_split_kv start");
     if (args.split_kv >= 1) return;
     auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";      
     int sm_count = args.hw_info.sm_count;
-    // printf("    sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
+    // K scaled to units of 1024; the thresholds below compare against this value.
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    int max_splits = 1;
 
-    // TODO: This avoids a hang when the batch size larger than 1 and 
-    // there is more than 1 kv_splits. 
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(1, max_splits);
+    // Heuristic table: upper bound on the split count chosen from B and seq_length_k.
+    // The first matching row wins; anything unmatched falls back to a single split.
+    if (B <= 4 && seq_length_k >= 16) {
+      max_splits = 16;
     }
-    
-    // printf("    max_splits = %d\n", max_splits);
+    else if (B <= 8 && seq_length_k >= 4) {
+      max_splits = 8;
+    }
+    else if ((B <= 16 && seq_length_k >= 8) ||
+             (B == 48 && seq_length_k >= 32)) {
+      max_splits = 4;
+    }
+    else if ((B <= 32 && seq_length_k >= 16) ||
+             (B == 96 && seq_length_k >= 16)) {
+      max_splits = 2;
+    }
+    else {
+      max_splits = 1;
+    }
+
+    // Wave-aware scheduling: ensure integer number of waves in K dimension
     int sms_per_batch = max(1, sm_count / B);
-    // printf("    sms_per_batch = %d\n", sms_per_batch);
     int split_heur = min(max_splits, sms_per_batch);
+    // NOTE(review): `waves` is computed here but not read again in this function.
     int waves = ceil_div(B * split_heur, sm_count);
     int k_waves = ceil_div(max_splits, split_heur);
     int split_wave_aware = ceil_div(max_splits, k_waves);
     args.split_kv = split_wave_aware;
-    // printf("    args.split_kv = %d\n", args.split_kv);
-
   }

  /// Determines whether the GEMM can execute the given problem.
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml`