Compare commits
783 Commits
v0.6.1.pos
...
v0.6.4.pos
| Author | SHA1 | Date | |
|---|---|---|---|
| a6221a144a | |||
| 79ee45b428 | |||
| 691a3ec047 | |||
| 3a763ba0c3 | |||
| f2056f726d | |||
| 1d65ec7eeb | |||
| 26908554b2 | |||
| b311efd0bd | |||
| 3d158cdc8d | |||
| 02dbf30e9a | |||
| 2ac6d0e75b | |||
| 2ec8827288 | |||
| b40cf6402e | |||
| 2885ba0e24 | |||
| bf2ddc6610 | |||
| 972112d82f | |||
| 11cd1ae6ad | |||
| 554af9228d | |||
| b2e0ad3b59 | |||
| 4a18fd14ba | |||
| 1dbae0329c | |||
| 675d603400 | |||
| 03025c023f | |||
| 29f3ef26a3 | |||
| 294bf467ba | |||
| 52b48c1ead | |||
| f67ce05d0b | |||
| e0853b6508 | |||
| 504ac53d18 | |||
| 15bb8330aa | |||
| ac49b59d8b | |||
| 0b8bb86bf1 | |||
| bb7991aa29 | |||
| d909acf9fe | |||
| b6dde33019 | |||
| 1b886aa104 | |||
| 3945c82346 | |||
| 032fcf16ae | |||
| 56a955e774 | |||
| bbd3e86926 | |||
| 0d4ea3fb5c | |||
| 112fa0bbe5 | |||
| 377b74fe87 | |||
| 18081451f9 | |||
| 96ae0eaeb2 | |||
| 1f55e05713 | |||
| 8a06428c70 | |||
| b41fb9d3b1 | |||
| 7c65527918 | |||
| 47db6ec831 | |||
| 176fcb1c71 | |||
| a838ba7254 | |||
| 36c513a076 | |||
| d201d41973 | |||
| 3a28f18b0b | |||
| 812c981fa0 | |||
| 7f5edb5900 | |||
| eea55cca5b | |||
| 9cdba9669c | |||
| d1c6799b88 | |||
| 6ace6fba2c | |||
| 08f93e7439 | |||
| 9d5b4e4dea | |||
| 8a7fe47d32 | |||
| 4800339c62 | |||
| fe15729a2b | |||
| 330e82d34a | |||
| d7a4f2207b | |||
| f9dadfbee3 | |||
| 25144ceed0 | |||
| e6de9784d2 | |||
| 36fc439de0 | |||
| 874f551b36 | |||
| 2cebda42bb | |||
| 5fb1f935b0 | |||
| 36e4acd02a | |||
| 58170d6503 | |||
| 9804ac7c7c | |||
| f89d18ff74 | |||
| f0f2e5638e | |||
| ad9a78bf64 | |||
| 73b9083e99 | |||
| 20cf2f553c | |||
| bfb7d61a7c | |||
| 19682023b6 | |||
| 9fa4bdde9d | |||
| 51c2e1fcef | |||
| b09895a618 | |||
| d88bff1b96 | |||
| 9e37266420 | |||
| 8a4358ecb5 | |||
| bd46357ad9 | |||
| f192aeba74 | |||
| 8e1529dc57 | |||
| 1a95f10ee7 | |||
| 49d2a41a86 | |||
| 47672f38b5 | |||
| f83feccd7f | |||
| e0191a95d8 | |||
| d7edca1dee | |||
| 127c07480e | |||
| 10b67d865d | |||
| 4f93dfe952 | |||
| e1b5a82179 | |||
| 87713c6053 | |||
| b5815c8413 | |||
| 6b30471586 | |||
| f6778620a9 | |||
| 0535e5fe6c | |||
| b489fc3c91 | |||
| 208ce622c7 | |||
| 1ff4aed5bd | |||
| f10797c0ce | |||
| f4c2187e29 | |||
| aea6ad629f | |||
| da07a9ead7 | |||
| 3a7f15a398 | |||
| 7371749d54 | |||
| ad39bd640c | |||
| 40d0e7411d | |||
| 6bb52b0f97 | |||
| 201fc07730 | |||
| 42b4f46b71 | |||
| 073a472728 | |||
| 93bff421bc | |||
| 28b2877d30 | |||
| 97b8475beb | |||
| a2f1f3b089 | |||
| 3be5b26a76 | |||
| de0e61a323 | |||
| 9d43afcc53 | |||
| ae62fd17c0 | |||
| a62bc0109c | |||
| 999df95b4e | |||
| a6f332d0d9 | |||
| 0dfba97b42 | |||
| aa9078fa03 | |||
| e036e527a0 | |||
| 6192e9b8fe | |||
| d7263a1bb8 | |||
| 104d729656 | |||
| db7db4aab9 | |||
| 1fa020c539 | |||
| e7b84c394d | |||
| a4b3e0c1e9 | |||
| 29862b884b | |||
| d3859f1891 | |||
| 4ab3256644 | |||
| 719c1ca468 | |||
| 74f2f8a0f1 | |||
| d58268c56a | |||
| 87bd7e0515 | |||
| 098f94de42 | |||
| 399c798608 | |||
| 406d4cc480 | |||
| a5bba7d234 | |||
| 2003cc3513 | |||
| 6a585a23d2 | |||
| a02a50e6e5 | |||
| a5fda50a10 | |||
| 21063c11c7 | |||
| 4be3a45158 | |||
| 4089985552 | |||
| 9d59b75593 | |||
| ea928f608c | |||
| 2bcbae704c | |||
| ffc0f2b47a | |||
| 82bfc38d07 | |||
| c4cacbaa7f | |||
| 0c63c34f72 | |||
| 966e31697b | |||
| 43300bd98a | |||
| ca9844b340 | |||
| 235366fe2e | |||
| 02462465ea | |||
| b9c64c0ca7 | |||
| d2e80332a7 | |||
| a53046b16f | |||
| 731aec5be7 | |||
| 09d3550372 | |||
| cd34029e91 | |||
| 5952d81139 | |||
| 93dee88f6b | |||
| 7a83b1aec0 | |||
| ad23318928 | |||
| bbc3619dc8 | |||
| 04bbf38e05 | |||
| 8f0a9ca890 | |||
| 2094062b4e | |||
| d93478b399 | |||
| ac04a97a9f | |||
| 9a5664d4a4 | |||
| 04cef2c6ab | |||
| 6e056bcf04 | |||
| 5208dc7a20 | |||
| 1c45f4c385 | |||
| 603a661ae8 | |||
| fb2716d641 | |||
| 8d72bb20fa | |||
| ac6b8f19b9 | |||
| ccb5376a9a | |||
| ea4adeddc1 | |||
| 4dbcbbeb09 | |||
| b67feb1274 | |||
| c49f0407ba | |||
| 91c9ebbb1b | |||
| 54597724f4 | |||
| 1f1b6d6eda | |||
| 3bb4befea7 | |||
| ae5279a163 | |||
| 1b73ab2a1f | |||
| cea808f325 | |||
| 74b529ceee | |||
| d6459b4516 | |||
| e893795443 | |||
| 1d4cfe2be1 | |||
| eed92f12fc | |||
| af7380d83b | |||
| a78dd3303e | |||
| d522034c85 | |||
| 6c0b7f548d | |||
| d151fde834 | |||
| 27cd36e6e2 | |||
| 18bd7587b7 | |||
| 598b6d7b07 | |||
| aff1fd8188 | |||
| 4581d2cc02 | |||
| 1dd4cb2935 | |||
| ba0d892074 | |||
| 30a2e80742 | |||
| 06386a64dd | |||
| d3aa2a8b2f | |||
| 2b5bf20988 | |||
| 93a76dd21d | |||
| 566cd27797 | |||
| 37a4947dcd | |||
| 96e0c9cbbd | |||
| 031a7995f3 | |||
| b63c64d95b | |||
| 9fb12f7848 | |||
| 55650c83a0 | |||
| 77f7ef2908 | |||
| 16b8f7a86f | |||
| 5608e611c2 | |||
| 3ea2dc2ec4 | |||
| d087bf863e | |||
| 890ca36072 | |||
| abbfb6134d | |||
| 64384bbcdf | |||
| 00d91c8a2c | |||
| c2cd1a2142 | |||
| c787f2d81d | |||
| 33d257735f | |||
| 3b3f1e7436 | |||
| 9ff4511e43 | |||
| 81f09cfd80 | |||
| cc98f1e079 | |||
| 211fe91aa8 | |||
| 6aa6020f9b | |||
| ff5ed6e1bc | |||
| 7b0365efef | |||
| 04a3ae0aca | |||
| 62fac4b9aa | |||
| 226688bd61 | |||
| 64cb1cdc3f | |||
| 1ab6f6b4ad | |||
| bc73e9821c | |||
| 8d7724104a | |||
| 882a1ad0de | |||
| 67bdf8e523 | |||
| 0ad216f575 | |||
| 7585ec996f | |||
| ab6f981671 | |||
| ac3d748dba | |||
| 0ce7798f44 | |||
| 0f43387157 | |||
| 08600ddc68 | |||
| 74fc2d77ae | |||
| 622b7ab955 | |||
| 09500f7dde | |||
| ef7865b4f9 | |||
| eae3d48181 | |||
| e74f2d448c | |||
| 7a4df5f200 | |||
| c5d7fb9ddc | |||
| 76ed5340f0 | |||
| 97b61bfae6 | |||
| aa0addb397 | |||
| 5f8d8075f9 | |||
| 8b0e4f2ad7 | |||
| 2adb4409e0 | |||
| feb92fbe4a | |||
| 32176fee73 | |||
| 4e2d95e372 | |||
| 34a9941620 | |||
| e130c40e4e | |||
| 3cb07a36a2 | |||
| 8549c82660 | |||
| 67a6882da4 | |||
| 6650e6a930 | |||
| 07e981fdf4 | |||
| 55137e8ee3 | |||
| 5cbdccd151 | |||
| 067e77f9a8 | |||
| 6567e13724 | |||
| 228cfbd03f | |||
| ca0d92227e | |||
| 9645b9f646 | |||
| a6f3721861 | |||
| 9f7b4ba865 | |||
| c91ed47c43 | |||
| 59449095ab | |||
| e26d37a185 | |||
| 722d46edb9 | |||
| c866e0079d | |||
| d27cfbf791 | |||
| de662d32b5 | |||
| f58454968f | |||
| b979143d5b | |||
| ad6f78053e | |||
| 295a061fb3 | |||
| 8a02cd045a | |||
| 4fdc581f9e | |||
| 3770071eb4 | |||
| 836e8ef6ee | |||
| 056a68c7db | |||
| 33bab41060 | |||
| b7df53cd42 | |||
| bb01f2915e | |||
| b548d7a5f4 | |||
| fc6c274626 | |||
| 150b779081 | |||
| 9013e24f7b | |||
| fd0e2cfdb2 | |||
| e5ac6a4199 | |||
| dbdd3b5e5a | |||
| e7116c017c | |||
| 31a08f5bd2 | |||
| c18e1a3418 | |||
| 3ff57ebfca | |||
| 2394962d70 | |||
| 51c24c9736 | |||
| 831540cf04 | |||
| 29061ed9df | |||
| 65050a40e6 | |||
| 208cb34c81 | |||
| b17046e298 | |||
| d1e8240875 | |||
| cb6fdaa0a0 | |||
| 23b899a8e6 | |||
| 17c79f3c36 | |||
| cd5601ac37 | |||
| 434984e665 | |||
| 32a1ee74a0 | |||
| 08075c3448 | |||
| bb392ea2d2 | |||
| 9dbcce84a7 | |||
| a48e3ec052 | |||
| 6c5af09b39 | |||
| 3ddbe25502 | |||
| 0d02747f2e | |||
| f7db5f0fa9 | |||
| ca30c3c84b | |||
| c0292211ce | |||
| 74692421f7 | |||
| 29acd2c34c | |||
| f085995a7b | |||
| b729901139 | |||
| 76a5e13270 | |||
| ef7faad1b8 | |||
| 575dcebe9a | |||
| 711f3a7806 | |||
| 15713e3b75 | |||
| d621c43df7 | |||
| 9d9186be97 | |||
| 5241aa1494 | |||
| ec6bd6c4c6 | |||
| 8ca8954841 | |||
| f6b97293aa | |||
| 496e991da8 | |||
| 696b01af8f | |||
| 855e0e6f97 | |||
| 4fa3e33349 | |||
| 962d2c6349 | |||
| 5b59fe0f08 | |||
| 8e3e7f2713 | |||
| 263d8ee150 | |||
| c5eea3c8ba | |||
| 85dc92fc98 | |||
| dfd951ed9b | |||
| 82c25151ec | |||
| 1325872ec8 | |||
| 380e18639f | |||
| 337ed76671 | |||
| 0c9a5258f9 | |||
| d11bf435a0 | |||
| 9bb10a7d27 | |||
| 3921a2f29e | |||
| 67a7e5ef38 | |||
| 051eaf6db3 | |||
| 7dbe738d65 | |||
| ae8b633ba3 | |||
| 1bbbcc0b1d | |||
| 25aeb7d4c9 | |||
| d2b1bf55ec | |||
| 1ffc8a7362 | |||
| 944dd8edaf | |||
| 154a8ae880 | |||
| de4008e2ab | |||
| 48138a8415 | |||
| 343f8e0905 | |||
| bb76538bbd | |||
| d615b5c9f8 | |||
| d65049daab | |||
| eca2c5f7c0 | |||
| 0f41fbe5a3 | |||
| 7871659abb | |||
| a2c71c5405 | |||
| 81ede99ca4 | |||
| 5eda21e773 | |||
| 8e1cddcd44 | |||
| 5e443b594f | |||
| 9d30a056e7 | |||
| 390be74649 | |||
| e312e52b44 | |||
| dbfa8d31d5 | |||
| 92d86da217 | |||
| c3fab5f769 | |||
| 776dbd74f1 | |||
| 8345045833 | |||
| 5b8a1fde84 | |||
| fb60ae9b91 | |||
| 415f76a9cb | |||
| cf1d62a644 | |||
| 59230ef32b | |||
| cee711fdbb | |||
| 1de76a0e55 | |||
| 7abba39ee6 | |||
| 7e7eae338d | |||
| ed920135c8 | |||
| 717a5f82cd | |||
| ba30942240 | |||
| 22f8a69549 | |||
| 5d264f4ab8 | |||
| e9d517f276 | |||
| 55e081fbad | |||
| 8e836d982a | |||
| 44eaa5a5d9 | |||
| 169b530607 | |||
| f0fe4fe86d | |||
| 4d31cd424b | |||
| 473e7b3606 | |||
| fd47e57f4b | |||
| 203ab8f80f | |||
| 4141608c6a | |||
| dfe43a2071 | |||
| 16b24e7dcd | |||
| f519902c52 | |||
| 250e26a63e | |||
| 2b184ddd4f | |||
| 00298e092c | |||
| 89feb4c84d | |||
| ec10cb8511 | |||
| d11b46f3a5 | |||
| c6cf9295e1 | |||
| de9fb4bef8 | |||
| 8baf85e4e9 | |||
| 1a1823871d | |||
| 6cf1167c1a | |||
| f710090d8e | |||
| 7342a7d7f8 | |||
| df3dcdf49d | |||
| 36ea79079b | |||
| e808156f30 | |||
| cbc2ef5529 | |||
| 94bf9ae4e9 | |||
| f990bab2a4 | |||
| e00c094f15 | |||
| a78c6ba7c8 | |||
| fb870fd491 | |||
| 270953bafb | |||
| 9cc811c4ff | |||
| e4d652ea3e | |||
| 78c0b4166c | |||
| 21efb603f5 | |||
| 055f3270d4 | |||
| 18511aeda6 | |||
| 83ea5c72b9 | |||
| 04de9057ab | |||
| 07c11cf4d4 | |||
| f3a507f1d3 | |||
| a64e7b9407 | |||
| ce00231a8b | |||
| de895f1697 | |||
| cf25b93bdd | |||
| d5fbb8706d | |||
| cdca8994bd | |||
| ca77dd7a44 | |||
| 7dea289066 | |||
| cfaa6008e6 | |||
| 21906a6f50 | |||
| dc4aea677a | |||
| c8627cd41b | |||
| 8bfaa4e31e | |||
| 0b5b5d767e | |||
| cdc72e3c80 | |||
| 7627172bf4 | |||
| 480b7f40cf | |||
| acce7630c1 | |||
| ffc4b27ea8 | |||
| 2f4117c38e | |||
| 9ba0bd6aa6 | |||
| 2a131965a8 | |||
| bd37b9fbe2 | |||
| de24046fcd | |||
| 1874c6a1b0 | |||
| 9a94ca4a5d | |||
| cfba685bd4 | |||
| 069d3bd8d0 | |||
| a3691b6b5e | |||
| 8c746226c9 | |||
| e1faa2a598 | |||
| 80b57f00d5 | |||
| 04c12f8157 | |||
| 8eeb857084 | |||
| fa45513a51 | |||
| c0d9a98d0c | |||
| e0dbdb013d | |||
| 93cf74a8a7 | |||
| 151ef4efd2 | |||
| f19da64871 | |||
| 4f95ffee6f | |||
| 8c6de96ea1 | |||
| 18b296fdb2 | |||
| c8f26bb636 | |||
| 487678d046 | |||
| cb3b2b9ba4 | |||
| fdf59d30ea | |||
| b22b798471 | |||
| f22619fe96 | |||
| 168cab6bbf | |||
| 23fea8714a | |||
| f4dd830e09 | |||
| 5df1834895 | |||
| cfadb9c687 | |||
| 15986f598c | |||
| 53b3a33027 | |||
| dac914b0d6 | |||
| a95354a36e | |||
| 663874e048 | |||
| cc90419e89 | |||
| 27302dd584 | |||
| 0cc566ca8f | |||
| 05c531be47 | |||
| fbb74420e7 | |||
| 05d686432f | |||
| 0dcc8cbe5a | |||
| 26aa325f4f | |||
| e5dc713c23 | |||
| 36eecfbddb | |||
| 9ade8bbc8d | |||
| 22482e495e | |||
| 3d826d2c52 | |||
| 0e36fd4909 | |||
| 0f6d7a9a34 | |||
| 303d44790a | |||
| aeb37c2a72 | |||
| 3dbb215b38 | |||
| 2838d6b38e | |||
| 91add85ec4 | |||
| 9aaf14c62e | |||
| 63e39937f9 | |||
| f5d72b2fc6 | |||
| 83caf35e08 | |||
| 01843c89b8 | |||
| 19a4dd0990 | |||
| 18c2e30c57 | |||
| 19f0d25796 | |||
| f58d4fccc9 | |||
| afb050b29d | |||
| 7f60520deb | |||
| 563649aafe | |||
| 1570203864 | |||
| 22f5851b80 | |||
| 4f341bd4bf | |||
| 35bd215168 | |||
| 1fe0a4264a | |||
| bc4eb65b54 | |||
| 82f3937e59 | |||
| 7da2487591 | |||
| aaccca2b4d | |||
| 062c89e7c9 | |||
| bce324487a | |||
| 1425a1bcf9 | |||
| 1cabfcefb6 | |||
| be76e5aabf | |||
| 2ae25f79cf | |||
| 8e60afa15e | |||
| b6d7392579 | |||
| e01ab595d8 | |||
| f13a07b1f8 | |||
| 6c9ba48fde | |||
| 1fb9c1b0bf | |||
| 31f46a0d35 | |||
| 3d49776bbb | |||
| bc2ef1f77c | |||
| 2e7fe7e79f | |||
| 26a68d5d7e | |||
| d081da0064 | |||
| 5bf8789b2a | |||
| d1537039ce | |||
| cc276443b5 | |||
| e585b583a9 | |||
| 090e945e36 | |||
| e1a3f5e831 | |||
| 19d02ff938 | |||
| 39d3f8d94f | |||
| b0298aa8cc | |||
| 260024a374 | |||
| d86f6b2afb | |||
| bd429f2b75 | |||
| 18e60d7d13 | |||
| c2ec430ab5 | |||
| c5d55356f9 | |||
| 172d1cd276 | |||
| a9b15c606f | |||
| 8df2dc3c88 | |||
| 6d792d2f31 | |||
| 0e088750af | |||
| dc4e3df5c2 | |||
| 3b00b9c26c | |||
| 344cd2b6f4 | |||
| 1b49148e47 | |||
| 4b377d6feb | |||
| 71d21c73ab | |||
| ee2da3e9ef | |||
| e2f6f26e86 | |||
| b28d2104de | |||
| 93d364da34 | |||
| d9cfbc891e | |||
| 70de39f6b4 | |||
| 68988d4e0d | |||
| 520db4dbc1 | |||
| f70bccac75 | |||
| 4bb98f2190 | |||
| 7193774b1f | |||
| e2c6e0a829 | |||
| 770ec6024f | |||
| 4f1ba0844b | |||
| 873edda6cf | |||
| 64840dfae4 | |||
| 28e1299e60 | |||
| 0c4d2ad5e6 | |||
| c6f2485c82 | |||
| 300da09177 | |||
| 1c046447a6 | |||
| 8fae5ed7f6 | |||
| 3368c3ab36 | |||
| 1ac3de09cd | |||
| 3e073e66f1 | |||
| c23953675f | |||
| e3dd0692fa | |||
| fc3afc20df | |||
| b4522474a3 | |||
| ee777d9c30 | |||
| 6e0c9d6bd0 | |||
| 6da1ab6b41 | |||
| 01b6f9e1f0 | |||
| 13f9f7a3d0 | |||
| 1e7d5c01f5 | |||
| 2467b642dd | |||
| 72fc97a0f1 | |||
| 2529d09b5a | |||
| a928ded995 | |||
| cc4325b66a | |||
| 8ff7ced996 | |||
| 3f06bae907 | |||
| b8747e8a7c | |||
| 3185fb0cca | |||
| 0250dd68c5 | |||
| 88577ac928 | |||
| 530821d00c | |||
| 1a2aef3e59 | |||
| 5f7bb58427 | |||
| b05f5c9238 | |||
| 9b0e3ec970 | |||
| 86e9c8df29 | |||
| ee5f34b1c2 | |||
| f2bd246c17 | |||
| a79e522984 | |||
| 3e83c12b5c | |||
| e551ca1555 | |||
| 9b8c8ba119 | |||
| d23679eb99 | |||
| 57a0702e63 | |||
| 3dda7c2250 | |||
| 92ba7e7477 | |||
| d4a2ac8302 | |||
| c6bd70d772 | |||
| 5b59532760 | |||
| ca2b628b3c | |||
| 8ca5051b9a | |||
| 06ed2815e2 | |||
| 0e40ac9b7b | |||
| 13d88d4137 | |||
| d66ac62854 | |||
| 9dc7c6c7f3 | |||
| ec4aaad812 | |||
| 4dfdf43196 | |||
| 5e85f4f82a | |||
| 71c60491f2 | |||
| 0faab90eb0 | |||
| 0455c46ed4 | |||
| d4bf085ad0 | |||
| 0057894ef7 | |||
| 0f961b3ce9 | |||
| 7f9c8902e3 | |||
| 7c8566aa4f | |||
| b4e4eda92e | |||
| 2874bac618 | |||
| 035fa895ec | |||
| b28298f2f4 | |||
| 2940afa04e | |||
| 3b63de9353 | |||
| 260d40b5ea | |||
| 9e5ec35b1f | |||
| 18ae428a0d | |||
| de6f90a13d | |||
| 6cb748e190 | |||
| 9e99407e3c | |||
| ea4647b7d7 | |||
| e42c634acb | |||
| 9cc373f390 | |||
| 76515f303b | |||
| 855c8ae2c9 | |||
| c52ec5f034 | |||
| 02c9afa2d0 | |||
| 3118f63385 | |||
| 4c34ce8916 | |||
| 0d47bf3bf4 | |||
| d9cd78eb71 | |||
| db9120cded | |||
| b3195bc9e4 | |||
| e18749ff09 | |||
| d65798f78c | |||
| a8c1d161a7 | |||
| 7c7714d856 | |||
| 9d104b5beb | |||
| 6ffa3f314c | |||
| e351572900 | |||
| 95965d31b6 | |||
| 8110e44529 | |||
| 09deb4721f | |||
| fa0c114fad | |||
| 98f9713399 | |||
| 56c3de018c | |||
| a54ed80249 | |||
| 9855b99502 | |||
| 1009e93c5d | |||
| 1b6de8352b | |||
| cbdb252259 | |||
| 99aa4eddaf | |||
| ee2bceaaa6 | |||
| 1c1bb388e0 | |||
| 546034b466 | |||
| cca61642e0 | |||
| 5ce45eb54d | |||
| 5478c4b41f | |||
| 47f5e03b5b | |||
| 2759a43a26 | |||
| 5d73ae49d6 | |||
| 781e3b9a42 | |||
| acd5511b6d | |||
| 837c1968f9 | |||
| a091e2da3e | |||
| fc990f9795 | |||
| 3724d5f6b5 | |||
| 50e9ec41fc | |||
| 47790f3e32 | |||
| a36e070dad | |||
| 8a0cf1ddc3 | |||
| 1ef0d2efd0 | |||
| 851725202a |
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.764
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.764
|
||||
limit: 250
|
||||
num_fewshot: 5
|
||||
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.356
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.358
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
||||
@ -1,6 +1,7 @@
|
||||
Meta-Llama-3-8B-Instruct.yaml
|
||||
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
||||
Minitron-4B-Base-FP8.yaml
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
|
||||
# pip install lm-eval==0.4.4
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
|
||||
done
|
||||
|
||||
lm_eval --model hf \
|
||||
--model_args pretrained=$MODEL,parallelize=True \
|
||||
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||
--batch_size $BATCH_SIZE
|
||||
--model_args "pretrained=$MODEL,parallelize=True" \
|
||||
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size "$BATCH_SIZE"
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.3
|
||||
# pip install lm-eval==0.4.4
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
|
||||
done
|
||||
|
||||
lm_eval --model vllm \
|
||||
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
|
||||
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||
--batch_size $BATCH_SIZE
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
|
||||
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size "$BATCH_SIZE"
|
||||
|
||||
@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
|
||||
done
|
||||
|
||||
# Parse list of configs.
|
||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
|
||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||
|
||||
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||
do
|
||||
|
||||
@ -49,10 +49,15 @@ def test_lm_eval_correctness():
|
||||
results = launch_lm_eval(eval_config)
|
||||
|
||||
# Confirm scores match ground truth.
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
ground_truth = metric["value"]
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(f'{task["name"]} | {metric["name"]}: '
|
||||
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
success = success and numpy.isclose(
|
||||
ground_truth, measured_value, rtol=RTOL)
|
||||
|
||||
# Assert at the end, print all scores even on failure for debugging.
|
||||
assert success
|
||||
|
||||
@ -8,8 +8,7 @@ steps:
|
||||
containers:
|
||||
- image: badouralix/curl-jq
|
||||
command:
|
||||
- sh
|
||||
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
- wait
|
||||
- label: "A100"
|
||||
agents:
|
||||
|
||||
28
.buildkite/nightly-benchmarks/nightly-annotation.md
Normal file
28
.buildkite/nightly-benchmarks/nightly-annotation.md
Normal file
@ -0,0 +1,28 @@
|
||||
|
||||
## Description
|
||||
|
||||
This file contains the downloading link for benchmarking results.
|
||||
|
||||
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
|
||||
- [benchmarking results](artifact://results.zip)
|
||||
- [benchmarking code](artifact://nightly-benchmarks.zip)
|
||||
|
||||
Please download the visualization scripts in the post
|
||||
|
||||
|
||||
## Results reproduction
|
||||
|
||||
- Find the docker we use in `benchmarking pipeline`
|
||||
- Deploy the docker, and inside the docker:
|
||||
- Download `nightly-benchmarks.zip`.
|
||||
- In the same folder, run the following code
|
||||
```
|
||||
export HF_TOKEN=<your HF token>
|
||||
apt update
|
||||
apt install -y git
|
||||
unzip nightly-benchmarks.zip
|
||||
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
```
|
||||
|
||||
And the results will be inside `./benchmarks/results`.
|
||||
|
||||
@ -1,45 +1,39 @@
|
||||
|
||||
# Nightly benchmark
|
||||
|
||||
The main goal of this benchmarking is two-fold:
|
||||
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
|
||||
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
|
||||
This benchmark aims to:
|
||||
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
|
||||
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
||||
|
||||
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
|
||||
|
||||
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||
|
||||
|
||||
## Docker images
|
||||
## Setup
|
||||
|
||||
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
|
||||
- vllm/vllm-openai:v0.5.0.post1
|
||||
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
|
||||
- openmmlab/lmdeploy:v0.5.0
|
||||
- ghcr.io/huggingface/text-generation-inference:2.1
|
||||
- Docker images:
|
||||
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||
- Hardware
|
||||
- 8x Nvidia A100 GPUs
|
||||
- Workload:
|
||||
- Dataset
|
||||
- ShareGPT dataset
|
||||
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
||||
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
||||
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||
- Models: llama-3 8B, llama-3 70B.
|
||||
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||
|
||||
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
|
||||
# Known issues
|
||||
|
||||
|
||||
## Hardware
|
||||
|
||||
One AWS node with 8x NVIDIA A100 GPUs.
|
||||
|
||||
|
||||
## Workload description
|
||||
|
||||
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
|
||||
|
||||
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
|
||||
- Output length: the corresponding output length of these 500 prompts.
|
||||
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||
|
||||
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
|
||||
|
||||
## Plots
|
||||
|
||||
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
|
||||
|
||||
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
|
||||
|
||||
## Results
|
||||
|
||||
{nightly_results_benchmarking_table}
|
||||
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
||||
- TGI does not support `ignore-eos` flag.
|
||||
@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
|
||||
|
||||
common_container_settings: &common_container_settings
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
@ -37,7 +37,10 @@ common_container_settings: &common_container_settings
|
||||
|
||||
steps:
|
||||
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
|
||||
- label: "A100 trt benchmark"
|
||||
|
||||
|
||||
|
||||
- label: "A100 vllm step 10"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -46,7 +49,21 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
|
||||
- image: vllm/vllm-openai:v0.6.2
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
|
||||
- label: "A100 sglang benchmark"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: lmsysorg/sglang:v0.3.2-cu121
|
||||
<<: *common_container_settings
|
||||
|
||||
- label: "A100 lmdeploy benchmark"
|
||||
@ -58,11 +75,13 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: openmmlab/lmdeploy:v0.5.0
|
||||
- image: openmmlab/lmdeploy:v0.6.1-cu12
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
- label: "A100 vllm benchmark"
|
||||
|
||||
|
||||
|
||||
- label: "A100 trt llama-8B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -71,10 +90,25 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: vllm/vllm-openai:latest
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama8B"
|
||||
|
||||
- label: "A100 tgi benchmark"
|
||||
|
||||
- label: "A100 trt llama-70B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -83,12 +117,54 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: ghcr.io/huggingface/text-generation-inference:2.1
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama70B"
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
|
||||
# - label: "A100 trt benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
# <<: *common_container_settings
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
|
||||
# - label: "A100 tgi benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
|
||||
# <<: *common_container_settings
|
||||
|
||||
- wait
|
||||
|
||||
- label: "Plot"
|
||||
- label: "Collect the results"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -117,4 +193,4 @@ steps:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
|
||||
- wait
|
||||
- block: ":rocket: check the results!"
|
||||
@ -1,76 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
check_hf_token
|
||||
|
||||
df -h
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
|
||||
# run lmdeploy
|
||||
if which lmdeploy >/dev/null; then
|
||||
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# run tgi
|
||||
if [ -e /tgi-entrypoint.sh ]; then
|
||||
echo "tgi is available, redirect to run-tgi-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# run trt
|
||||
if which trtllm-build >/dev/null; then
|
||||
echo "trtllm is available, redirect to run-trt-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# run vllm
|
||||
if [ -e /vllm-workspace ]; then
|
||||
echo "vllm is available, redirect to run-vllm-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@ -56,7 +56,7 @@ serving_column_mapping = {
|
||||
|
||||
def read_markdown(file):
|
||||
if os.path.exists(file):
|
||||
with open(file, "r") as f:
|
||||
with open(file) as f:
|
||||
return f.read() + "\n"
|
||||
else:
|
||||
return f"{file} not found.\n"
|
||||
@ -75,14 +75,14 @@ if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file, "r") as f:
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
if "serving" in str(test_file):
|
||||
# this result is generated via `benchmark_serving.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
@ -97,7 +97,7 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_latency.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
@ -119,7 +119,7 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_throughput.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
|
||||
@ -0,0 +1,95 @@
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_perf(df, method, model, metric):
|
||||
|
||||
means = []
|
||||
|
||||
for qps in [2, 4, 8, 16, "inf"]:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
target = target & df['Test name'].str.contains("qps_" + str(qps))
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.)
|
||||
else:
|
||||
means.append(filtered_df[metric].values[0])
|
||||
|
||||
return np.array(means)
|
||||
|
||||
|
||||
def get_perf_w_std(df, method, model, metric):
|
||||
|
||||
if metric in ["TTFT", "ITL"]:
|
||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||
mean = mean.tolist()
|
||||
std = get_perf(df, method, model, "Std " + metric + " (ms)")
|
||||
if std.mean() == 0:
|
||||
std = None
|
||||
success = get_perf(df, method, model, "Successful req.")
|
||||
if std is not None:
|
||||
std = std / np.sqrt(success)
|
||||
std = std.tolist()
|
||||
|
||||
else:
|
||||
assert metric == "Tput"
|
||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||
df, method, model, "Output Tput (tok/s)")
|
||||
mean = mean.tolist()
|
||||
std = None
|
||||
|
||||
return mean, std
|
||||
|
||||
|
||||
def main(args):
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file) as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
|
||||
with open(args.description) as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
228
.buildkite/nightly-benchmarks/scripts/launch-server.sh
Normal file
228
.buildkite/nightly-benchmarks/scripts/launch-server.sh
Normal file
@ -0,0 +1,228 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Currently FP8 benchmark is NOT enabled.
|
||||
|
||||
set -x
|
||||
server_params=$1
|
||||
common_params=$2
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
launch_trt_server() {
|
||||
|
||||
model_path=$(echo "$common_params" | jq -r '.model')
|
||||
model_name="${model_path#*/}"
|
||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
||||
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
|
||||
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
|
||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
||||
|
||||
# create model caching directory
|
||||
cd ~
|
||||
rm -rf models
|
||||
mkdir -p models
|
||||
cd models
|
||||
models_dir=$(pwd)
|
||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
||||
|
||||
# clone tensorrt backend
|
||||
cd /
|
||||
rm -rf tensorrtllm_backend
|
||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||
git lfs install
|
||||
cd tensorrtllm_backend
|
||||
git checkout "$trt_llm_version"
|
||||
git submodule update --init --recursive
|
||||
|
||||
# build trtllm engine
|
||||
cd /tensorrtllm_backend
|
||||
cd "./tensorrt_llm/examples/${model_type}"
|
||||
python3 convert_checkpoint.py \
|
||||
--model_dir "${model_path}" \
|
||||
--dtype "${model_dtype}" \
|
||||
--tp_size "${model_tp_size}" \
|
||||
--output_dir "${trt_model_path}"
|
||||
trtllm-build \
|
||||
--checkpoint_dir "${trt_model_path}" \
|
||||
--use_fused_mlp \
|
||||
--reduce_fusion disable \
|
||||
--workers 8 \
|
||||
--gpt_attention_plugin "${model_dtype}" \
|
||||
--gemm_plugin "${model_dtype}" \
|
||||
--tp_size "${model_tp_size}" \
|
||||
--max_batch_size "${max_batch_size}" \
|
||||
--max_input_len "${max_input_len}" \
|
||||
--max_seq_len "${max_seq_len}" \
|
||||
--max_num_tokens "${max_num_tokens}" \
|
||||
--output_dir "${trt_engine_path}"
|
||||
|
||||
# handle triton protobuf files and launch triton server
|
||||
cd /tensorrtllm_backend
|
||||
mkdir triton_model_repo
|
||||
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
|
||||
cd triton_model_repo
|
||||
rm -rf ./tensorrt_llm/1/*
|
||||
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
|
||||
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
|
||||
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
|
||||
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
|
||||
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
|
||||
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
|
||||
cd /tensorrtllm_backend
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--world_size="${model_tp_size}" \
|
||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||
|
||||
}
|
||||
|
||||
launch_tgi_server() {
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
--quantize fp8 \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
}
|
||||
|
||||
launch_lmdeploy_server() {
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
server_command="lmdeploy serve api_server $model \
|
||||
--tp $tp \
|
||||
--server-port $port \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
bash -c "$server_command" &
|
||||
}
|
||||
|
||||
launch_sglang_server() {
|
||||
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m sglang.launch_server \
|
||||
--tp $tp \
|
||||
--model-path $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m sglang.launch_server \
|
||||
--tp $tp \
|
||||
--model-path $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
}
|
||||
|
||||
launch_vllm_server() {
|
||||
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
|
||||
launch_trt_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
|
||||
launch_tgi_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
launch_lmdeploy_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
|
||||
launch_sglang_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
|
||||
launch_vllm_server
|
||||
fi
|
||||
}
|
||||
|
||||
main
|
||||
@ -1,102 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
|
||||
server_params=$1
|
||||
common_params=$2
|
||||
|
||||
|
||||
|
||||
model_path=$(echo "$common_params" | jq -r '.model')
|
||||
model_name="${model_path#*/}"
|
||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
||||
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
|
||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
||||
|
||||
cd ~
|
||||
rm -rf models
|
||||
mkdir -p models
|
||||
cd models
|
||||
models_dir=$(pwd)
|
||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
||||
|
||||
cd ~
|
||||
rm -rf tensorrt-demo
|
||||
git clone https://github.com/neuralmagic/tensorrt-demo.git
|
||||
cd tensorrt-demo
|
||||
tensorrt_demo_dir=$(pwd)
|
||||
|
||||
# make sure the parameter inside tensorrt_demo is consistent to envvar
|
||||
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
|
||||
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
|
||||
|
||||
|
||||
cd /
|
||||
rm -rf tensorrtllm_backend
|
||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||
git lfs install
|
||||
cd tensorrtllm_backend
|
||||
git checkout $trt_llm_version
|
||||
tensorrtllm_backend_dir=$(pwd)
|
||||
git submodule update --init --recursive
|
||||
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
|
||||
|
||||
cd /tensorrtllm_backend
|
||||
cd ./tensorrt_llm/examples/${model_type}
|
||||
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
|
||||
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
|
||||
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
|
||||
python ../quantization/quantize.py \
|
||||
--model_dir ${model_path} \
|
||||
--dtype ${model_dtype} \
|
||||
--tp_size ${model_tp_size} \
|
||||
--output_dir ${trt_model_path} \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
--calib_size 2
|
||||
|
||||
else
|
||||
|
||||
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
|
||||
python3 convert_checkpoint.py \
|
||||
--model_dir ${model_path} \
|
||||
--dtype ${model_dtype} \
|
||||
--tp_size ${model_tp_size} \
|
||||
--output_dir ${trt_model_path}
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
trtllm-build \
|
||||
--checkpoint_dir=${trt_model_path} \
|
||||
--gpt_attention_plugin=${model_dtype} \
|
||||
--gemm_plugin=${model_dtype} \
|
||||
--remove_input_padding=enable \
|
||||
--paged_kv_cache=enable \
|
||||
--tp_size=${model_tp_size} \
|
||||
--max_batch_size=${max_batch_size} \
|
||||
--max_input_len=${max_input_len} \
|
||||
--max_output_len=${max_output_len} \
|
||||
--max_num_tokens=${max_output_len} \
|
||||
--opt_num_tokens=${max_output_len} \
|
||||
--output_dir=${trt_engine_path}
|
||||
|
||||
cd /tensorrtllm_backend/triton_model_repo
|
||||
rm -rf ./tensorrt_llm/1/*
|
||||
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
|
||||
cd /tensorrtllm_backend
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--world_size=${model_tp_size} \
|
||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||
@ -8,6 +8,7 @@ main() {
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
(which zip) || (apt-get install -y zip)
|
||||
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip plotting the results."
|
||||
@ -15,26 +16,63 @@ main() {
|
||||
fi
|
||||
|
||||
# initial annotation
|
||||
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||
|
||||
# download results
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
mkdir -p results/
|
||||
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
|
||||
ls
|
||||
ls results/
|
||||
|
||||
# generate figures
|
||||
python3 -m pip install tabulate pandas matplotlib
|
||||
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
--description $description \
|
||||
--results-folder results/
|
||||
# upload benchmark results
|
||||
zip -r results.zip results/
|
||||
/workspace/buildkite-agent artifact upload "results.zip"
|
||||
|
||||
# upload benchmarking scripts
|
||||
cd "$VLLM_SOURCE_CODE_LOC/"
|
||||
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
|
||||
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
# upload benchmarking pipeline
|
||||
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
|
||||
|
||||
# upload results and figures
|
||||
/workspace/buildkite-agent artifact upload "nightly_results.png"
|
||||
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
|
||||
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
|
||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
|
||||
|
||||
|
||||
# The figures should be genereated by a separate process outside the CI/CD pipeline
|
||||
|
||||
# # generate figures
|
||||
# python3 -m pip install tabulate pandas matplotlib
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
|
||||
# --description $description \
|
||||
# --results-folder results/
|
||||
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sharegpt
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_2048_128
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_128_2048
|
||||
|
||||
# # upload results and figures
|
||||
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
|
||||
}
|
||||
|
||||
main "$@"
|
||||
main "$@"
|
||||
|
||||
@ -1,135 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main(args):
|
||||
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file, "r") as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
|
||||
with open(args.description, "r") as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
plt.rcParams.update({'font.size': 20})
|
||||
|
||||
# plot results
|
||||
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
|
||||
fig.subplots_adjust(hspace=1)
|
||||
methods = ["vllm", "trt", "lmdeploy", "tgi"]
|
||||
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
|
||||
for j, metric in enumerate(["TTFT", "ITL"]):
|
||||
means, stds = [], []
|
||||
for method in methods:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.)
|
||||
stds.append(0.)
|
||||
else:
|
||||
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
|
||||
std = filtered_df[f"Std {metric} (ms)"].values[0]
|
||||
success = filtered_df["Successful req."].values[0]
|
||||
stds.append(std / math.sqrt(success))
|
||||
|
||||
print(model, metric)
|
||||
print(means, stds)
|
||||
|
||||
ax = axes[i, j + 1]
|
||||
|
||||
bars = ax.bar(
|
||||
["vllm", "trt", "lmdeploy", "tgi"],
|
||||
means,
|
||||
yerr=stds,
|
||||
capsize=10,
|
||||
)
|
||||
for idx, bar in enumerate(bars):
|
||||
bar.set_color(bar_colors[idx])
|
||||
ax.set_ylim(bottom=0)
|
||||
|
||||
ax.set_ylabel(f"{metric} (ms)")
|
||||
ax.set_title(f"{model} {metric}")
|
||||
ax.grid(axis='y')
|
||||
|
||||
metric = "Tput"
|
||||
j = 0
|
||||
if True:
|
||||
tputs = []
|
||||
for method in methods:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
tputs.append(0.)
|
||||
else:
|
||||
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
|
||||
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
|
||||
tputs.append(input_tput + output_tput)
|
||||
|
||||
print(model, metric)
|
||||
print(tputs)
|
||||
|
||||
ax = axes[i, j]
|
||||
|
||||
bars = ax.bar(
|
||||
["vllm", "trt", "lmdeploy", "tgi"],
|
||||
tputs,
|
||||
)
|
||||
for idx, bar in enumerate(bars):
|
||||
bar.set_color(bar_colors[idx])
|
||||
|
||||
ax.set_ylim(bottom=0)
|
||||
|
||||
ax.set_ylabel("Tput (token/s)")
|
||||
ax.set_title(f"{model} {metric}")
|
||||
ax.grid(axis='y')
|
||||
|
||||
fig.tight_layout()
|
||||
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
@ -1,218 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill lmdeploy || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 10
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
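As a standalone sanity check for json2args, a small sketch assuming bash and jq are installed and the function above has been sourced:

    # Input taken from the comment above; expected output:
    #   --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    params='{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}'
    json2args "$params"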
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append lmdeploy to the test name
|
||||
test_name=lmdeploy_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
|
||||
server_command="lmdeploy serve api_server $model \
|
||||
--tp $tp \
|
||||
--server-port $port \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
bash -c "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "lmdeploy server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "lmdeploy failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# get model name
|
||||
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend lmdeploy \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--model \"$model_name\" \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "lmdeploy" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
python -m pip install transformers==4.41.2
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
355 .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh Normal file
@@ -0,0 +1,355 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
get_current_llm_serving_engine() {
|
||||
|
||||
if which lmdeploy >/dev/null; then
|
||||
echo "Container: lmdeploy"
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /tgi-entrypoint.sh ]; then
|
||||
echo "Container: tgi"
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
return
|
||||
fi
|
||||
|
||||
if which trtllm-build >/dev/null; then
|
||||
echo "Container: tensorrt-llm"
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /sgl-workspace ]; then
|
||||
echo "Container: sglang"
|
||||
export CURRENT_LLM_SERVING_ENGINE=sglang
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /vllm-workspace ]; then
|
||||
echo "Container: vllm"
|
||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
|
||||
return
|
||||
fi
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill -f python
|
||||
pkill -f python3
|
||||
pkill -f tritonserver
|
||||
pkill -f pt_main_thread
|
||||
pkill -f text-generation
|
||||
pkill -f lmdeploy
|
||||
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
ensure_installed() {
|
||||
# Ensure that the given command is installed by apt-get
|
||||
local cmd=$1
|
||||
if ! which "$cmd" >/dev/null; then
|
||||
apt-get update && apt-get install -y "$cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ $reuse_server == "true" ]]; then
|
||||
echo "Reuse previous server for test case $test_name"
|
||||
else
|
||||
kill_gpu_processes
|
||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||
"$server_params" "$common_params"
|
||||
fi
|
||||
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
# this is required for lmdeploy.
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
|
||||
|
||||
# change model name for lmdeploy (it will not follow standard hf name)
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
if [[ $backend = "trt" ]]; then
|
||||
backend="tensorrt-llm"
|
||||
fi
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
|
||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
elif [[ "$dataset_name" = "sonnet" ]]; then
|
||||
|
||||
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
|
||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--sonnet-input-len $sonnet_input_len \
|
||||
--sonnet-output-len $sonnet_output_len \
|
||||
--sonnet-prefix-len $sonnet_prefix_len \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
else
|
||||
|
||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command="None"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
done
|
||||
|
||||
kill_gpu_processes
|
||||
}
|
||||
|
||||
|
||||
prepare_dataset() {
|
||||
|
||||
# download sharegpt dataset
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
echo "" > sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat sonnet.txt >> sonnet_4x.txt
|
||||
done
|
||||
|
||||
}
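An optional check (a sketch, not part of the script) to confirm the 4x duplication above produced enough text for the long sonnet prompts:

    # sonnet_4x.txt should hold roughly four times the lines of sonnet.txt
    wc -l sonnet.txt sonnet_4x.txt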
|
||||
|
||||
main() {
|
||||
|
||||
# check if the environment variable is successfully injected from yaml
|
||||
|
||||
check_gpus
|
||||
check_hf_token
|
||||
get_current_llm_serving_engine
|
||||
|
||||
pip install -U transformers
|
||||
|
||||
# check storage
|
||||
df -h
|
||||
|
||||
ensure_installed wget
|
||||
ensure_installed curl
|
||||
ensure_installed jq
|
||||
|
||||
prepare_dataset
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
|
||||
# run the test
|
||||
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||
|
||||
# upload benchmark results to buildkite
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -17,7 +17,7 @@ check_gpus() {
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ kill_gpu_processes() {
|
||||
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
|
||||
@@ -117,7 +117,7 @@ upload_to_buildkite() {
|
||||
fi
|
||||
|
||||
# Use the determined command to annotate and upload artifacts
|
||||
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
|
||||
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
|
||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
@@ -150,7 +150,7 @@ run_latency_tests() {
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -206,9 +206,9 @@ run_throughput_tests() {
|
||||
throughput_args=$(json2args "$throughput_params")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
|
||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -270,7 +270,7 @@ run_serving_tests() {
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -278,7 +278,7 @@ run_serving_tests() {
|
||||
server_model=$(echo "$server_params" | jq -r '.model')
|
||||
client_model=$(echo "$client_params" | jq -r '.model')
|
||||
if [[ $server_model != "$client_model" ]]; then
|
||||
echo "Server model and client model must be the same. Skip testcase $testname."
|
||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -293,8 +293,7 @@ run_serving_tests() {
|
||||
server_pid=$!
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "vllm server is up and running."
|
||||
else
|
||||
|
||||
@@ -1,216 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill text-generation || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 10
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append tgi to the test name
|
||||
test_name=tgi_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
echo "Key 'fp8' exists in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
--quantize fp8 \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "tgi server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "tgi failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend tgi \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "tgi" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -1,214 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill tritonserver || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 20
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append trt to the test name
|
||||
test_name=trt_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
|
||||
echo "Running test case $test_name"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "trt server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "trt failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend tensorrt-llm \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command=""
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "trt" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
|
||||
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
# update transformers package, to make sure mixtral tokenizer is available
|
||||
python -m pip install transformers -U
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -1,221 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
# kill all processes on GPU.
|
||||
pkill pt_main_thread
|
||||
sleep 10
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append vllm to the test name
|
||||
test_name=vllm_$test_name
|
||||
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "vllm server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "vllm failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "vllm" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -17,10 +17,17 @@ serving_column_mapping = {
|
||||
"request_throughput": "Tput (req/s)",
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"std_ttft_ms": "Std TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"mean_itl_ms": "Mean ITL (ms)",
|
||||
"std_itl_ms": "Std ITL (ms)",
|
||||
"input_throughput": "Input Tput (tok/s)",
|
||||
"median_itl_ms": "Median ITL (ms)",
|
||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||
"std_tpot_ms": "Std TPOT (ms)",
|
||||
"median_tpot_ms": "Median TPOT (ms)",
|
||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||
"output_throughput": "Output Tput (tok/s)",
|
||||
"total_input_tokens": "Total input tokens",
|
||||
"total_output_tokens": "Total output tokens",
|
||||
"engine": "Engine",
|
||||
}
|
||||
|
||||
@ -29,11 +36,11 @@ if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file, "r") as f:
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
|
||||
@@ -2,9 +2,11 @@
|
||||
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
|
||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
|
||||
|
||||
TIMEOUT_SECONDS=10
|
||||
|
||||
retries=0
|
||||
while [ $retries -lt 1000 ]; do
|
||||
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
|
||||
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
@@ -14,4 +16,4 @@ while [ $retries -lt 1000 ]; do
|
||||
sleep 5
|
||||
done
|
||||
|
||||
exit 1
|
||||
exit 1
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
[
|
||||
{
|
||||
"test_name": "llama8B_tp1",
|
||||
"qps_list": [4],
|
||||
"test_name": "llama8B_tp1_sharegpt",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
"port": 8000,
|
||||
"reuse_server": false
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
@@ -21,34 +23,158 @@
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4",
|
||||
"qps_list": [2],
|
||||
"test_name": "llama8B_tp1_sonnet_512_16",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 16,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama8B_tp1_sonnet_512_256",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 256,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sharegpt",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
"port": 8000,
|
||||
"reuse_server": false
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
@@ -59,34 +185,50 @@
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "mixtral8x7B_tp2",
|
||||
"qps_list": [2],
|
||||
"test_name": "llama70B_tp4_sonnet_512_16",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tp": 2,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 16,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
@@ -97,20 +239,85 @@
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sonnet_512_256",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 256,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
}
|
||||
]
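The runner scripts above consume this file with jq; a hedged one-liner (field names taken from the entries above, file path assumed to be tests/nightly-tests.json) to preview what would be executed:

    # Print each test name together with its QPS sweep
    jq -r '.[] | "\(.test_name): qps=\(.qps_list | @json)"' tests/nightly-tests.json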
|
||||
@@ -3,30 +3,26 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
# rename the files to change linux -> manylinux1
|
||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build CUDA 11.8 wheel"
|
||||
key: block-build-cu118-wheel
|
||||
|
||||
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
|
||||
# However, this block can be uncommented to save some compute hours.
|
||||
# - block: "Build CUDA 11.8 wheel"
|
||||
# key: block-build-cu118-wheel
|
||||
|
||||
- label: "Build wheel - CUDA 11.8"
|
||||
depends_on: block-build-cu118-wheel
|
||||
# depends_on: block-build-cu118-wheel
|
||||
agents:
|
||||
queue: cpu_queue
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
# rename the files to change linux -> manylinux1
|
||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
@ -31,8 +33,8 @@ cleanup_docker() {
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes
|
||||
docker volume prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
@ -57,17 +59,17 @@ done
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull ${image_name}
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p ${HF_CACHE}
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
@ -83,6 +85,7 @@ if [[ $commands == *" kernels "* ]]; then
|
||||
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/test_flash_attn.py \
|
||||
--ignore=kernels/test_flashinfer.py \
|
||||
--ignore=kernels/test_gguf.py \
|
||||
--ignore=kernels/test_int8_quant.py \
|
||||
--ignore=kernels/test_machete_gemm.py \
|
||||
--ignore=kernels/test_mamba_ssm.py \
|
||||
@ -93,38 +96,49 @@ if [[ $commands == *" kernels "* ]]; then
|
||||
--ignore=kernels/test_sampler.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_accuracy.py \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
--ignore=entrypoints/openai/test_embedding.py \
|
||||
--ignore=entrypoints/openai/test_oot_registration.py "}
|
||||
fi
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host has 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
#replace shard arguments
|
||||
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
echo "Shard ${GPU} commands:$commands"
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
docker run \
|
||||
--device /dev/kfd --device /dev/dri \
|
||||
--network host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=${GPU} \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
||||
-e HF_HOME=${HF_MOUNT} \
|
||||
--name ${container_name}_${GPU} \
|
||||
${image_name} \
|
||||
/bin/bash -c "${commands}" \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in ${PIDS[@]}; do
|
||||
wait ${pid}
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
for st in ${STATUS[@]}; do
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit ${st}
|
||||
exit "${st}"
|
||||
fi
|
||||
done
|
||||
else
|
||||
@ -135,9 +149,9 @@ else
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=0 \
|
||||
-e HF_TOKEN \
|
||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
||||
-e HF_HOME=${HF_MOUNT} \
|
||||
--name ${container_name} \
|
||||
${image_name} \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
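The shard-fanning logic in this script relies on the bash ${var//pattern/replacement} idiom to stamp a per-GPU --shard-id into an otherwise shared command string. A standalone sketch of that substitution pattern (values are made up for illustration; this is not the CI script itself):

#!/bin/bash
# Placeholder-substitution sketch with assumed values.
commands="pytest -v -s kernels --shard-id= --num-shards= "
PARALLEL_JOB_COUNT=8
for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
  # ${var//pattern/replacement} rewrites every occurrence of the pattern,
  # so each worker gets its own shard id while the shard count stays fixed.
  commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
  commands_gpu=${commands_gpu//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  echo "Shard ${GPU}: ${commands_gpu}"
done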
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
||||
|
||||
set -ex
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
@ -13,21 +15,38 @@ remove_docker_container
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
source /etc/environment
|
||||
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
pip install pytest matplotlib einops transformers_stream_generator
|
||||
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio \
|
||||
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||
transformers_stream_generator matplotlib datamodel_code_generator
|
||||
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 25 mins.
|
||||
export -f cpu_tests
|
||||
timeout 25m bash -c "cpu_tests"
|
||||
|
||||
@ -1,10 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
|
||||
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
|
||||
@ -12,38 +18,61 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
pip install pytest matplotlib einops transformers_stream_generator
|
||||
pytest -v -s tests/models/decoder_only/language \
|
||||
--ignore=tests/models/test_fp8.py \
|
||||
--ignore=tests/models/decoder_only/language/test_jamba.py \
|
||||
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||
# offline inference
|
||||
docker exec cpu-test-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference.py"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test bash -c "
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio \
|
||||
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||
transformers_stream_generator matplotlib datamodel_code_generator
|
||||
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=48-92
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 25 mins.
|
||||
export -f cpu_tests
|
||||
timeout 25m bash -c "cpu_tests $CORE_RANGE"
|
||||
|
||||
.buildkite/run-hpu-test.sh (new file, 16 lines)
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f hpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
|
||||
@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
|
||||
|
||||
shift 4
|
||||
COMMANDS=("$@")
|
||||
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
|
||||
if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
|
||||
echo "The number of commands must be equal to the number of nodes."
|
||||
echo "Number of nodes: $NUM_NODES"
|
||||
echo "Number of commands: ${#COMMANDS[@]}"
|
||||
@ -23,7 +23,7 @@ fi
|
||||
|
||||
echo "List of commands"
|
||||
for command in "${COMMANDS[@]}"; do
|
||||
echo $command
|
||||
echo "$command"
|
||||
done
|
||||
|
||||
start_network() {
|
||||
@ -36,7 +36,7 @@ start_nodes() {
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
@ -49,17 +49,20 @@ start_nodes() {
|
||||
# 3. map the huggingface cache directory to the container
|
||||
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||
# starting from 192.168.10.11)
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
||||
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
||||
/bin/bash -c "tail -f /dev/null"
|
||||
|
||||
# organize containers into a ray cluster
|
||||
if [ $node -eq 0 ]; then
|
||||
if [ "$node" -eq 0 ]; then
|
||||
# start the ray head node
|
||||
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
|
||||
docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
|
||||
# wait for the head node to be ready
|
||||
sleep 10
|
||||
else
|
||||
# start the ray worker nodes, and connect them to the head node
|
||||
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||
docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||
fi
|
||||
done
|
||||
|
||||
@ -79,22 +82,22 @@ run_nodes() {
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||
if [ $node -ne 0 ]; then
|
||||
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
if [ "$node" -ne 0 ]; then
|
||||
docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
else
|
||||
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
cleanup() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
docker stop node$node
|
||||
docker stop "node$node"
|
||||
done
|
||||
docker network rm docker-net
|
||||
}
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the Neuron docker image and run the API server inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -e
|
||||
@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||
current_time=$(date +%s)
|
||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||
docker system prune -f
|
||||
echo $current_time > /tmp/neuron-docker-build-timestamp
|
||||
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
else
|
||||
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t neuron -f Dockerfile.neuron .
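For context, the hunk above implements a once-per-day throttle: a timestamp file records the last prune, and docker system prune only runs again when more than 86400 seconds (one day) have passed. A self-contained sketch of the pattern, with the stamp path chosen here only for illustration:

#!/bin/bash
# Once-per-day cleanup throttle sketch (the stamp path is an assumed example).
stamp=/tmp/last-prune-timestamp
now=$(date +%s)
if [ -f "$stamp" ]; then
  last=$(cat "$stamp")
  if [ $((now - last)) -gt 86400 ]; then
    docker system prune -f        # expensive cleanup, at most once per day
    echo "$now" > "$stamp"
  fi
else
  echo "$now" > "$stamp"          # first run: just record the time
fi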
|
||||
@ -34,7 +36,7 @@ wait_for_server_to_start() {
|
||||
timeout=300
|
||||
counter=0
|
||||
|
||||
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
|
||||
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
|
||||
sleep 1
|
||||
counter=$((counter + 1))
|
||||
if [ $counter -ge $timeout ]; then
|
||||
|
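The truncated hunk above, like the timeout 600 bash -c 'until curl ...' lines in the CPU tests, is the same readiness pattern: poll an HTTP endpoint until it returns 200, and fail if a deadline passes first. A compact sketch with the endpoint and limit as assumed example values:

#!/bin/bash
# Readiness-poll sketch; localhost:8000/health and the 300s limit are assumptions.
timeout=300
counter=0
until [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" = "200" ]; do
  sleep 1
  counter=$((counter + 1))
  if [ "$counter" -ge "$timeout" ]; then
    echo "Server did not become healthy within ${timeout}s" >&2
    exit 1
  fi
done
echo "Server is up"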
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
@ -11,4 +13,4 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
@ -12,4 +14,4 @@ remove_docker_container
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
|
||||
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
@ -11,4 +13,4 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
|
||||
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
|
||||
|
||||
@ -9,6 +9,8 @@
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# nightly(bool): run this test in nightly pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually)
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
# commands(list): the list of commands to run for test. incompatible with command.
|
||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||
@ -39,17 +41,19 @@ steps:
|
||||
# Check API reference (if it fails, you may have missing mock imports)
|
||||
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 15min
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
- tests/async_engine
|
||||
- tests/test_inputs
|
||||
- tests/multimodal
|
||||
- tests/test_utils
|
||||
- tests/worker
|
||||
commands:
|
||||
- pytest -v -s async_engine # Async Engine
|
||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
@ -61,14 +65,22 @@ steps:
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_preemption
|
||||
commands:
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Chunked Prefill Test
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_chunked_prefill
|
||||
commands:
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
|
||||
- label: Core Test # 10min
|
||||
mirror_hardwares: [amd]
|
||||
fast_check: true
|
||||
@ -79,18 +91,21 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Test # 20min
|
||||
- label: Entrypoints Test # 40min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
#mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -103,7 +118,10 @@ steps:
|
||||
- vllm/core/
|
||||
- tests/distributed
|
||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
|
||||
@ -131,7 +149,9 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
command: pytest -v -s test_regression.py
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
@ -145,7 +165,15 @@ steps:
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: Examples Test # 12min
|
||||
- label: V1 Test
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1
|
||||
|
||||
- label: Examples Test # 15min
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
@ -162,15 +190,9 @@ steps:
|
||||
- python3 offline_inference_vision_language_multi_image.py
|
||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference_encoder_decoder.py
|
||||
- python3 offline_profile.py --model facebook/opt-125m
|
||||
|
||||
- label: torch compile integration test
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pytest -v -s ./compile/test_full_graph.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
- label: Prefix Caching Test # 7min
|
||||
- label: Prefix Caching Test # 9min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -178,7 +200,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
- label: Samplers Test # 18min
|
||||
- label: Samplers Test # 36min
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
@ -194,17 +216,15 @@ steps:
|
||||
- tests/test_logits_processor
|
||||
command: pytest -v -s test_logits_processor.py
|
||||
|
||||
- label: Speculative decoding tests # 22min
|
||||
- label: Speculative decoding tests # 30min
|
||||
source_file_dependencies:
|
||||
- vllm/spec_decode
|
||||
- tests/spec_decode
|
||||
commands:
|
||||
# See https://github.com/vllm-project/vllm/issues/5152
|
||||
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||
|
||||
- label: LoRA Test %N # 30min each
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
@ -212,7 +232,25 @@ steps:
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
|
||||
parallelism: 4
|
||||
|
||||
- label: Kernels Test %N # 30min each
|
||||
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
|
||||
- label: "PyTorch Fullgraph Test" # 18min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Test %N # 1h each
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@ -239,15 +277,14 @@ steps:
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- pip install aiohttp
|
||||
- bash run-benchmarks.sh
|
||||
|
||||
- label: Quantization Test # 15min
|
||||
- label: Quantization Test # 33min
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
command: pytest -v -s quantization
|
||||
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
@ -255,10 +292,16 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pip install lm-eval
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||
|
||||
- label: Encoder Decoder tests # 5min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/encoder_decoder
|
||||
commands:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
fast_check: false
|
||||
mirror_hardwares: [ amd ]
|
||||
@ -270,42 +313,79 @@ steps:
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 3min
|
||||
- label: Basic Models Test # 30min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
|
||||
- label: Decoder-only Language Models Test # 1h3min
|
||||
- label: Language Models Test (Standard) # 42min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language
|
||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/language -m core_model
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
|
||||
- label: Decoder-only Multi-Modal Models Test # 56min
|
||||
- label: Language Models Test (Extended) # 50min
|
||||
nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 26min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/audio_language
|
||||
- pytest -v -s models/decoder_only/vision_language
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
- label: Other Models Test # 5min
|
||||
#mirror_hardwares: [amd]
|
||||
- label: Multi-Modal Models Test (Extended) # 1h15m
|
||||
nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pytest -v -s models/embedding/language
|
||||
- pytest -v -s models/encoder_decoder/language
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@ -338,7 +418,7 @@ steps:
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 28min
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@ -348,18 +428,22 @@ steps:
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- vllm/compilation
|
||||
commands:
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
|
||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 21min
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -377,7 +461,7 @@ steps:
|
||||
- pytest -v -s multi_step/test_correctness_async_llm.py
|
||||
- pytest -v -s multi_step/test_correctness_llm.py
|
||||
|
||||
- label: Pipeline Parallelism Test # 23min
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -403,7 +487,7 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s -x lora/test_long_context.py
|
||||
|
||||
- label: Weight Loading Multiple GPU Test
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -436,7 +520,8 @@ steps:
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
@ -447,6 +532,5 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pip install lm-eval
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||
|
||||
.buildkite/upload-wheels.sh (new file, 38 lines)
@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
# Check that exactly one wheel is found
|
||||
if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the single wheel file
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# Rename 'linux' to 'manylinux1' in the wheel filename
|
||||
new_wheel="${wheel/linux/manylinux1}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version: $version"
|
||||
|
||||
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||
if [[ $version == *dev* ]]; then
|
||||
new_version="1.0.0.dev"
|
||||
new_wheel="${wheel/$version/$new_version}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
version="$new_version"
|
||||
fi
|
||||
|
||||
# Upload the wheel to S3
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
@ -1,4 +1,33 @@
|
||||
vllm/*.so
|
||||
/.venv
|
||||
/build
|
||||
dist
|
||||
vllm/*.so
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
.mypy_cache
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
/build/
|
||||
cmake-build-*/
|
||||
CMakeUserPresets.json
|
||||
develop-eggs/
|
||||
/dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
.github/CODEOWNERS (vendored, new file, 30 lines)
@ -0,0 +1,30 @@
|
||||
# See https://help.github.com/articles/about-codeowners/
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# This lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
CMakeLists.txt @tlrmchlsmth @WoosukKwon
|
||||
|
||||
# Test ownership
|
||||
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/quantization @mgoin @robertgshaw2-neuralmagic
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/multi_step @alexm-neuralmagic @comaniac
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||
.github/dependabot.yml (vendored, new file, 32 lines)
@ -0,0 +1,32 @@
|
||||
version: 2
|
||||
updates:
|
||||
# Maintain dependencies for GitHub Actions
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
labels: ["dependencies"]
|
||||
open-pull-requests-limit: 5
|
||||
reviewers: ["khluu", "simon-mo"]
|
||||
allow:
|
||||
- dependency-type: "all"
|
||||
ignore:
|
||||
- dependency-name: "torch"
|
||||
- dependency-name: "torchvision"
|
||||
- dependency-name: "xformers"
|
||||
- dependency-name: "lm-format-enforcer"
|
||||
- dependency-name: "gguf"
|
||||
- dependency-name: "compressed-tensors"
|
||||
- dependency-name: "ray[adag]"
|
||||
- dependency-name: "lm-eval"
|
||||
groups:
|
||||
patch-update:
|
||||
applies-to: version-updates
|
||||
update-types: ["patch"]
|
||||
minor-update:
|
||||
applies-to: version-updates
|
||||
update-types: ["minor"]
|
||||
.github/mergify.yml (vendored, new file, 60 lines)
@ -0,0 +1,60 @@
|
||||
pull_request_rules:
|
||||
- name: label-documentation
|
||||
description: Automatically apply documentation label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^[^/]+\.md$
|
||||
- files~=^docs/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- documentation
|
||||
|
||||
- name: label-ci-build
|
||||
description: Automatically apply ci/build label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^\.github/
|
||||
- files~=\.buildkite/
|
||||
- files~=^cmake/
|
||||
- files=CMakeLists.txt
|
||||
- files~=^Dockerfile
|
||||
- files~=^requirements.*\.txt
|
||||
- files=setup.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- ci/build
|
||||
|
||||
- name: label-frontend
|
||||
description: Automatically apply frontend label
|
||||
conditions:
|
||||
- files~=^vllm/entrypoints/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- frontend
|
||||
|
||||
- name: ping author on conflicts and add 'needs-rebase' label
|
||||
conditions:
|
||||
- conflict
|
||||
- -closed
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- needs-rebase
|
||||
comment:
|
||||
message: |
|
||||
This pull request has merge conflicts that must be resolved before it can be
|
||||
merged. Please rebase the PR, @{{author}}.
|
||||
|
||||
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
||||
|
||||
- name: remove 'needs-rebase' label when conflict is resolved
|
||||
conditions:
|
||||
- -conflict
|
||||
- -closed
|
||||
actions:
|
||||
label:
|
||||
remove:
|
||||
- needs-rebase
|
||||
.github/scripts/cleanup_pr_body.sh (vendored, new executable file, 33 lines)
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -eu
|
||||
|
||||
# ensure 1 argument is passed
|
||||
if [ "$#" -ne 1 ]; then
|
||||
echo "Usage: $0 <pr_number>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PR_NUMBER=$1
|
||||
OLD=/tmp/orig_pr_body.txt
|
||||
NEW=/tmp/new_pr_body.txt
|
||||
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
|
||||
sed -i '/FIX #xxxx.*$/d' "${NEW}"
|
||||
|
||||
# Remove "FILL IN THE PR DESCRIPTION HERE"
|
||||
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
|
||||
|
||||
# Run this only if ${NEW} is different than ${OLD}
|
||||
if ! cmp -s "${OLD}" "${NEW}"; then
|
||||
echo "Updating PR body"
|
||||
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
|
||||
else
|
||||
echo "No changes needed"
|
||||
fi
|
||||
.github/workflows/actionlint.yml (vendored, new file, 40 lines)
@ -0,0 +1,40 @@
|
||||
name: Lint GitHub Actions workflows
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- '.github/workflows/*.ya?ml'
|
||||
- '.github/workflows/actionlint.*'
|
||||
- '.github/workflows/matchers/actionlint.json'
|
||||
pull_request:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- '.github/workflows/*.ya?ml'
|
||||
- '.github/workflows/actionlint.*'
|
||||
- '.github/workflows/matchers/actionlint.json'
|
||||
|
||||
env:
|
||||
LC_ALL: en_US.UTF-8
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
actionlint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Checkout"
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: "Run actionlint"
|
||||
run: |
|
||||
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||
tools/actionlint.sh -color
|
||||
.github/workflows/add_label_automerge.yml (vendored, 2 lines changed)
@ -8,7 +8,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Add label
|
||||
uses: actions/github-script@v5
|
||||
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||
with:
|
||||
script: |
|
||||
github.rest.issues.addLabels({
|
||||
|
||||
.github/workflows/clang-format.yml (vendored, 18 lines changed)
@ -6,9 +6,21 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- '**/*.h'
|
||||
- '**/*.cpp'
|
||||
- '**/*.cu'
|
||||
- '**/*.cuh'
|
||||
- '.github/workflows/clang-format.yml'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- '**/*.h'
|
||||
- '**/*.cpp'
|
||||
- '**/*.cu'
|
||||
- '**/*.cuh'
|
||||
- '.github/workflows/clang-format.yml'
|
||||
|
||||
jobs:
|
||||
clang-format:
|
||||
@ -17,9 +29,9 @@ jobs:
|
||||
matrix:
|
||||
python-version: ["3.11"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
@ -38,4 +50,4 @@ jobs:
|
||||
)
|
||||
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
|
||||
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
|
||||
| xargs clang-format --dry-run --Werror
|
||||
| xargs clang-format --dry-run --Werror
|
||||
|
||||
.github/workflows/cleanup_pr_body.yml (vendored, new file, 26 lines)
@ -0,0 +1,26 @@
|
||||
name: Cleanup PR Body
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened, edited]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
update-description:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
.github/workflows/codespell.yml (vendored, new file, 45 lines)
@ -0,0 +1,45 @@
|
||||
name: codespell
|
||||
|
||||
on:
|
||||
# Trigger the workflow on push or pull request,
|
||||
# but only for the main branch
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "**/*.py"
|
||||
- "**/*.md"
|
||||
- "**/*.rst"
|
||||
- pyproject.toml
|
||||
- requirements-lint.txt
|
||||
- .github/workflows/codespell.yml
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "**/*.py"
|
||||
- "**/*.md"
|
||||
- "**/*.rst"
|
||||
- pyproject.toml
|
||||
- requirements-lint.txt
|
||||
- .github/workflows/codespell.yml
|
||||
|
||||
jobs:
|
||||
codespell:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.12"]
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r requirements-lint.txt
|
||||
- name: Spelling check with codespell
|
||||
run: |
|
||||
codespell --toml pyproject.toml
|
||||
.github/workflows/matchers/actionlint.json (vendored, new file, 17 lines)
@ -0,0 +1,17 @@
|
||||
{
|
||||
"problemMatcher": [
|
||||
{
|
||||
"owner": "actionlint",
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
|
||||
"file": 1,
|
||||
"line": 2,
|
||||
"column": 3,
|
||||
"message": 4,
|
||||
"code": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
.github/workflows/matchers/mypy.json (vendored, new file, 16 lines)
@ -0,0 +1,16 @@
|
||||
{
|
||||
"problemMatcher": [
|
||||
{
|
||||
"owner": "mypy",
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
|
||||
"file": 1,
|
||||
"line": 2,
|
||||
"severity": 3,
|
||||
"message": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
.github/workflows/matchers/ruff.json (vendored, new file, 17 lines)
@ -0,0 +1,17 @@
|
||||
{
|
||||
"problemMatcher": [
|
||||
{
|
||||
"owner": "ruff",
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
|
||||
"file": 1,
|
||||
"line": 2,
|
||||
"column": 3,
|
||||
"code": 4,
|
||||
"message": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
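The regexp in this matcher splits a ruff-style diagnostic into file, line, column, code, and message capture groups. A quick way to sanity-check it (the sample line is invented, and grep -P needs a PCRE-enabled grep):

# Hedged check of the matcher pattern against a made-up diagnostic line.
echo "vllm/config.py:42:8: F401: 'os' imported but unused" \
  | grep -P '^(.+?):(\d+):(\d+): (\w+): (.+)$'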
|
||||
.github/workflows/mypy.yaml (vendored, 37 lines changed)
@ -6,20 +6,35 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- '**/*.py'
|
||||
- '.github/workflows/mypy.yaml'
|
||||
- 'tools/mypy.sh'
|
||||
- 'pyproject.toml'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
# This workflow is only relevant when one of the following files changes.
|
||||
# However, we have github configured to expect and require this workflow
|
||||
# to run and pass before github will auto-merge a pull request. Until github
|
||||
# allows more flexible auto-merge policy, we can just run this on every PR.
|
||||
# It doesn't take that long to run, anyway.
|
||||
#paths:
|
||||
# - '**/*.py'
|
||||
# - '.github/workflows/mypy.yaml'
|
||||
# - 'tools/mypy.sh'
|
||||
# - 'pyproject.toml'
|
||||
|
||||
jobs:
|
||||
ruff:
|
||||
mypy:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
||||
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
@ -32,15 +47,5 @@ jobs:
|
||||
pip install types-setuptools
|
||||
- name: Mypy
|
||||
run: |
|
||||
mypy
|
||||
mypy tests --follow-imports skip
|
||||
mypy vllm/attention --follow-imports skip
|
||||
mypy vllm/distributed --follow-imports skip
|
||||
mypy vllm/engine --follow-imports skip
|
||||
mypy vllm/executor --follow-imports skip
|
||||
mypy vllm/lora --follow-imports skip
|
||||
mypy vllm/model_executor --follow-imports skip
|
||||
mypy vllm/prompt_adapter --follow-imports skip
|
||||
mypy vllm/spec_decode --follow-imports skip
|
||||
mypy vllm/worker --follow-imports skip
|
||||
|
||||
echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
||||
tools/mypy.sh 1 ${{ matrix.python-version }}
|
||||
|
||||
22  .github/workflows/publish.yml  vendored
@@ -21,16 +21,16 @@ jobs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Extract branch info
        shell: bash
        run: |
          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"

      - name: Create Release
        id: create_release
        uses: "actions/github-script@v6"
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
@@ -48,16 +48,16 @@ jobs:
      fail-fast: false
      matrix:
        os: ['ubuntu-20.04']
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
        python-version: ['3.9', '3.10', '3.11', '3.12']
        pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
        cuda-version: ['11.8', '12.1']

    steps:
      - name: Checkout
        uses: actions/checkout@v3
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Setup ccache
        uses: hendrikmuhs/ccache-action@v1.2
        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
        with:
          create-symlink: true
          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
@@ -68,7 +68,7 @@ jobs:
          bash -x .github/workflows/scripts/env.sh

      - name: Set up Python
        uses: actions/setup-python@v4
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}

@@ -86,13 +86,13 @@ jobs:
          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
          asset_name=${wheel_name//"linux"/"manylinux1"}
          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
          echo "asset_name=${asset_name}" >> $GITHUB_ENV
          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

      - name: Upload Release Asset
        uses: actions/upload-release-asset@v1
        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
2  .github/workflows/reminder_comment.yml  vendored
@@ -8,7 +8,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
        uses: actions/github-script@v6
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            github.rest.issues.createComment({
53  .github/workflows/ruff.yml  vendored
@@ -6,32 +6,47 @@ on:
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - pyproject.toml
      - requirements-lint.txt
      - .github/workflows/matchers/ruff.json
      - .github/workflows/ruff.yml
  pull_request:
    branches:
      - main
    # This workflow is only relevant when one of the following files changes.
    # However, we have github configured to expect and require this workflow
    # to run and pass before github with auto-merge a pull request. Until github
    # allows more flexible auto-merge policy, we can just run this on every PR.
    # It doesn't take that long to run, anyway.
    #paths:
    # - "**/*.py"
    # - pyproject.toml
    # - requirements-lint.txt
    # - .github/workflows/matchers/ruff.json
    # - .github/workflows/ruff.yml

jobs:
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        python-version: ["3.12"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
    - name: Analysing the code with ruff
      run: |
        ruff .
    - name: Spelling check with codespell
      run: |
        codespell --toml pyproject.toml
    - name: Run isort
      run: |
        isort . --check-only
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements-lint.txt
    - name: Analysing the code with ruff
      run: |
        echo "::add-matcher::.github/workflows/matchers/ruff.json"
        ruff check --output-format github .
    - name: Run isort
      run: |
        isort . --check-only
8  .github/workflows/scripts/build.sh  vendored
@@ -1,4 +1,5 @@
#!/bin/bash
set -eux

python_executable=python$1
cuda_home=/usr/local/cuda-$2
@@ -8,12 +9,15 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements-cuda.txt
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"

bash tools/check_repo.sh

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
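As the script shows, `$1` selects the `python$1` interpreter and `$2` the CUDA toolkit under `/usr/local/cuda-$2`. A hedged local invocation, assuming CUDA 12.1 is installed at the usual path:

```bash
# Build a release wheel with Python 3.12 against /usr/local/cuda-12.1
bash .github/workflows/scripts/build.sh 3.12 12.1
```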
8  .github/workflows/scripts/cuda-install.sh  vendored
@@ -1,16 +1,16 @@
#!/bin/bash

# Replace '.' with '-' ex: 11.8 -> 11-8
cuda_version=$(echo $1 | tr "." "-")
cuda_version=$(echo "$1" | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS=$(echo $2 | tr -d ".\-")
OS=$(echo "$2" | tr -d ".\-")

# Installs CUDA
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
sudo apt clean

# Test nvcc
2  .github/workflows/scripts/pytorch-install.sh  vendored
@@ -6,7 +6,7 @@ cuda_version=$3

# Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"

# Print version information
$python_executable --version
37  .github/workflows/shellcheck.yml  vendored  Normal file
@@ -0,0 +1,37 @@
name: Lint shell scripts
on:
  push:
    branches:
      - "main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
  pull_request:
    branches:
      - "main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  shellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

      - name: "Check shell scripts"
        run: |
          tools/shellcheck.sh
52  .github/workflows/stale.yml  vendored  Normal file
@@ -0,0 +1,52 @@
name: 'Close inactive issues and PRs'

on:
  schedule:
    # Daily at 1:30 AM UTC
    - cron: '30 1 * * *'

jobs:
  close-issues-and-pull-requests:
    permissions:
      issues: write
      pull-requests: write
      actions: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
          operations-per-run: 1000

          exempt-draft-pr: true
          exempt-issue-labels: 'keep-open'
          exempt-pr-labels: 'keep-open'

          labels-to-add-when-unstale: 'unstale'
          labels-to-remove-when-stale: 'unstale'

          days-before-issue-stale: 90
          days-before-issue-close: 30
          stale-issue-label: 'stale'
          stale-issue-message: >
            This issue has been automatically marked as stale because it has not
            had any activity within 90 days. It will be automatically closed if no
            further activity occurs within 30 days. Leave a comment if
            you feel this issue should remain open. Thank you!
          close-issue-message: >
            This issue has been automatically closed due to inactivity. Please
            feel free to reopen if you feel it is still relevant. Thank you!

          days-before-pr-stale: 90
          days-before-pr-close: 30
          stale-pr-label: 'stale'
          stale-pr-message: >
            This pull request has been automatically marked as stale because it
            has not had any activity within 90 days. It will be automatically
            closed if no further activity occurs within 30 days. Leave a comment
            if you feel this pull request should remain open. Thank you!
          close-pr-message: >
            This pull request has been automatically closed due to inactivity.
            Please feel free to reopen if you intend to continue working on it.
            Thank you!
35  .github/workflows/yapf.yml  vendored
@@ -6,26 +6,33 @@ on:
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/yapf.yml
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/yapf.yml

jobs:
  yapf:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        python-version: ["3.12"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install yapf==0.32.0
        pip install toml==0.10.2
    - name: Running yapf
      run: |
        yapf --diff --recursive .
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install yapf==0.32.0
        pip install toml==0.10.2
    - name: Running yapf
      run: |
        yapf --diff --recursive .
14  .gitignore  vendored
@@ -1,5 +1,8 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# version file generated by setuptools-scm
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -12,6 +15,8 @@ __pycache__/
# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
@@ -28,6 +33,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
/.deps/

# PyInstaller
# Usually these files are written by a python script from a template
@@ -193,3 +199,7 @@ hip_compat.h

# Benchmark dataset
benchmarks/*.json

# Linting
actionlint
shellcheck*/
@@ -6,17 +6,16 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"
    python: "3.12"

sphinx:
  configuration: docs/source/conf.py
  fail_on_warning: true
  configuration: docs/source/conf.py
  fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
  - pdf
formats: []

# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: docs/requirements-docs.txt
  install:
    - requirements: docs/requirements-docs.txt
9  .shellcheckrc  Normal file
@@ -0,0 +1,9 @@
# rules currently disabled:
#
# SC1091 (info): Not following: <sourced file> was not specified as input (see shellcheck -x)
# SC2004 (style): $/${} is unnecessary on arithmetic variables.
# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
# SC2155 (warning): Declare and assign separately to avoid masking return values.
# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
#
disable=SC1091,SC2004,SC2129,SC2155,SC2164
427  CMakeLists.txt
@@ -1,5 +1,16 @@
cmake_minimum_required(VERSION 3.26)

# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,17 +24,20 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")

#
# Supported/expected torch versions for CUDA/ROCm.
@@ -35,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")

#
# Try to find python package with an executable that exactly matches
@@ -69,39 +83,6 @@ endif()
#
find_package(Torch REQUIRED)

#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")

# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)

set(VLLM_EXT_SRC
  "csrc/core/torch_bindings.cpp")

define_gpu_extension_target(
  _core_C
  DESTINATION vllm
  LANGUAGE CXX
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
  USE_SABI 3
  WITH_SOABI)

add_dependencies(default _core_C)

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -144,14 +125,32 @@ else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For cuda we want to be able to control which architectures we compile for on
  # a per-file basis in order to cut down on compile time. So here we extract
  # the set of architectures we want to compile for and remove the from the
  # CMAKE_CUDA_FLAGS so that they are not applied globally.
  #
  clear_cuda_arches(CUDA_ARCH_FLAGS)
  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  # Filter the target architectures by the supported supported archs
  # since for some files we will build for all CUDA_ARCHS.
  cuda_archs_loose_intersection(CUDA_ARCHS
    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
  #
  # For other GPU targets override the GPU architectures detected by cmake/torch
  # and filter them by the supported versions for the current language.
  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
  #
  override_gpu_arches(VLLM_GPU_ARCHES
    ${VLLM_GPU_LANG}
    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()

#
# Query torch for additional GPU compilation flags for the given
@@ -167,6 +166,17 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()


#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
#
include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

#
# Define other extension targets
#
@@ -177,21 +187,25 @@ endif()

set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  include(FetchContent)
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")

  FetchContent_Declare(
    cutlass
    GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@@ -210,29 +224,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
    "csrc/quantization/gguf/gguf_kernel.cu"
    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")

  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
  if (MARLIN_ARCHS)
    set(MARLIN_SRCS
      "csrc/quantization/fp8/fp8_marlin.cu"
      "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
      "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()

  #
  # The CUTLASS kernels for Hopper require sm90a to be enabled.
  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
    set_source_files_properties(
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
      PROPERTIES
      COMPILE_FLAGS
      "-gencode arch=compute_90a,code=sm_90a")
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()

    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
    # build any 3x kernels
    set(SCALED_MM_3X_ARCHS)
  endif()

  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
  else()
    if (SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
                     " for and covered by scaled_mm_c3x")
    else()
      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()


@@ -240,49 +314,75 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Machete kernels

  # The machete kernels only work on hopper and require CUDA 12.0 or later.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
  # Only build Machete kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
    # For the Machete kernels we automatically generate sources for various
    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    execute_process(
      COMMAND ${CMAKE_COMMAND} -E env
      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
      ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
      RESULT_VARIABLE machete_generation_result
      OUTPUT_VARIABLE machete_generation_output
      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
    )
    set(MACHETE_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

    if (NOT machete_generation_result EQUAL 0)
      message(FATAL_ERROR "Machete generation failed."
              " Result: \"${machete_generation_result}\""
              "\nCheck the log for details: "
              "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      )

      if (NOT machete_generation_result EQUAL 0)
        message(FATAL_ERROR "Machete generation failed."
                " Result: \"${machete_generation_result}\""
                "\nCheck the log for details: "
                "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
    else()
      message(STATUS "Machete generation completed successfully.")
      message(STATUS "Machete generation script has not changed, skipping generation.")
    endif()

    # Add machete generated sources
    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
    message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")

    set_source_files_properties(
      ${MACHETE_GEN_SOURCES}
      PROPERTIES
      COMPILE_FLAGS
      "-gencode arch=compute_90a,code=sm_90a")
    # forward compatible
    set_gencode_flags_for_srcs(
      SRCS "${MACHETE_GEN_SOURCES}"
      CUDA_ARCHS "${MACHETE_ARCHS}")

    list(APPEND VLLM_EXT_SRC
      csrc/quantization/machete/machete_pytorch.cu)

    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a16 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building Machete kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()

  # Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
  # raise an error if the user that this was built with an incompatible
  # CUDA version)
  list(APPEND VLLM_EXT_SRC
    csrc/quantization/machete/machete_pytorch.cu)
# if CUDA endif
endif()

message(STATUS "Enabling C extension.")
define_gpu_extension_target(
  _C
  DESTINATION vllm
@@ -294,8 +394,8 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -306,13 +406,39 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
    "csrc/moe/marlin_moe_ops.cu")
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    set(MARLIN_MOE_SRC
      "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
      "csrc/moe/marlin_moe_ops.cu")

    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
endif()

message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@@ -323,13 +449,98 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # _rocm_C extension
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
    "csrc/rocm/attention.cu")


if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)

  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)

  define_gpu_extension_target(
    _rocm_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_ROCM_EXT_SRC}
    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    USE_SABI 3
    WITH_SOABI)
endif()

# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
  return()
endif ()

# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
  endforeach()
endif()

#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# If no component is specified, vllm-flash-attn is still installed.

# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()

if(VLLM_FLASH_ATTN_SRC_DIR)
  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
endif()

# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)

# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)

# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)

# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)

# Copy over the vllm-flash-attn python files
install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm/vllm_flash_attn
  COMPONENT vllm_flash_attn_c
  FILES_MATCHING PATTERN "*.py"
)

# Nothing after vllm-flash-attn, see comment about macros above
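The `VLLM_FLASH_ATTN_SRC_DIR` hook in the CMake changes above is the intended route for local vllm-flash-attn development. A rough sketch of how it might be used (the checkout path is illustrative, not from the source):

```bash
# Point the vLLM build at a local flash-attention checkout instead of the
# pinned GIT_TAG; the environment variable takes precedence over -D arguments.
export VLLM_FLASH_ATTN_SRC_DIR="$HOME/src/flash-attention"  # hypothetical path
pip install -e .
```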
@@ -1,56 +1,3 @@
# Contributing to vLLM

Thank you for your interest in contributing to vLLM!
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project:

- Identify and report any issues or bugs.
- Request or add a new model.
- Suggest or implement new features.

However, remember that contributions aren't just about code.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.

Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.


## Setup for development

### Build from source

```bash
pip install -e .  # This may take several minutes.
```

### Testing

```bash
pip install -r requirements-dev.txt

# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
pytest tests/
```
**Note:** Currently, the repository does not pass the mypy tests.


## Contributing Guidelines

### Issue Reporting

If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
If not, please file a new issue, providing as much relevant information as possible.

### Pull Requests & Code Reviews

Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.

### Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
Your contributions make vLLM a great tool for everyone!
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
34  DCO  Normal file
@@ -0,0 +1,34 @@
Developer Certificate of Origin
Version 1.1

Copyright (C) 2004, 2006 The Linux Foundation and its contributors.

Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.


Developer's Certificate of Origin 1.1

By making a contribution to this project, I certify that:

(a) The contribution was created in whole or in part by me and I
    have the right to submit it under the open source license
    indicated in the file; or

(b) The contribution is based upon previous work that, to the best
    of my knowledge, is covered under an appropriate open source
    license and I have the right under that license to submit that
    work with modifications, whether created in whole or in part
    by me, under the same open source license (unless I am
    permitted to submit under a different license), as indicated
    in the file; or

(c) The contribution was provided directly to me by some other
    person who certified (a), (b) or (c) and I have not modified
    it.

(d) I understand and agree that this project and the contribution
    are public and that a record of the contribution (including all
    personal information I submit with it, including my sign-off) is
    maintained indefinitely and may be redistributed consistent with
    this project or the open source license(s) involved.
48  Dockerfile
@@ -27,6 +27,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
@@ -48,6 +56,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
@@ -59,15 +70,10 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -76,14 +82,13 @@ ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@@ -92,6 +97,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
@@ -102,6 +108,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
@@ -131,7 +138,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
@@ -169,6 +176,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/pip \
    . /etc/environment && \
    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
COPY examples examples
#################### vLLM installation IMAGE ####################


@@ -180,13 +188,17 @@ FROM vllm-base AS test
ADD . /vllm-workspace/

# install development dependencies (for testing)
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
# This installation must complete before the test dependencies are collected and installed.
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1

# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
@@ -202,7 +214,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'
    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10

ENV VLLM_USAGE_SOURCE production-docker-image
@@ -22,27 +22,17 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install intel_extension_for_pytorch==2.5.0

ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
WORKDIR /workspace

ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt

# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

RUN --mount=type=cache,target=/root/.cache/ccache \
    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
    -DONEDNN_BUILD_DOC=OFF \
    -DONEDNN_BUILD_EXAMPLES=OFF \
    -DONEDNN_BUILD_TESTS=OFF \
    -DONEDNN_BUILD_GRAPH=OFF \
    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build ./oneDNN/build --target install --config Release

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm
@@ -52,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

COPY ./ ./
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
@@ -60,8 +53,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl
    pip install dist/*.whl && \
    rm -rf dist

WORKDIR /workspace/
18  Dockerfile.hpu  Normal file
@@ -0,0 +1,18 @@
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,38 +1,41 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update \
    && apt-get install python3 python3-pip -y \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && \
    apt-get install -y \
        git \
        python3 \
        python3-pip \
        ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
WORKDIR ${APP_MOUNT}/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt
RUN python3 -m pip install -U \
    'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
    -r requirements-neuron.txt

ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..
RUN --mount=type=bind,source=.git,target=.git \
    pip install --no-build-isolation -v -e .

CMD ["/bin/bash"]
@@ -4,27 +4,22 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
    apt-get install -y python3-pip git && \
    apt-get install -y ffmpeg libsm6 libxext6 libgl1
    apt-get install -y \
        git python3-pip \
        ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/

COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks

CMD ["/bin/bash"]
@ -14,14 +14,23 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
# These packages will be in rocketce eventually
|
||||
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
||||
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||
torch==2.3.1 \
|
||||
-r requirements-cpu.txt \
|
||||
xformers uvloop==0.20.0
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# Default ROCm 6.1 base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||
# Default ROCm 6.2 base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
|
||||
|
||||
# Default ROCm ARCHes to build vLLM for.
|
||||
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||
@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||
# Whether to install CK-based flash-attention
|
||||
# If 0, will not install flash-attention
|
||||
ARG BUILD_FA="1"
|
||||
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
|
||||
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
|
||||
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
|
||||
# architectures specified in `FA_GFX_ARCHS`
|
||||
ARG TRY_FA_WHEEL="1"
|
||||
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
|
||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
||||
ARG FA_BRANCH="23a2b1c2"
|
||||
ARG FA_BRANCH="3cea2fb"
|
||||
|
||||
# Whether to build triton on rocm
|
||||
ARG BUILD_TRITON="1"
|
||||
ARG TRITON_BRANCH="e0fc12c"
|
||||
ARG TRITON_BRANCH="e192dba"
|
||||
|
||||
### Base image build stage
|
||||
FROM $BASE_IMAGE AS base
|
||||
@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
|
||||
# Remove sccache so it doesn't interfere with ccache
|
||||
# TODO: implement sccache support across components
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
# Install torch == 2.5.0 on ROCm
|
||||
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.1"*) \
|
||||
|
||||
# Install torch == 2.6.0 on ROCm
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.2"*) \
|
||||
python3 -m pip uninstall -y torch torchvision \
|
||||
&& python3 -m pip install --no-cache-dir --pre \
|
||||
torch==2.5.0.dev20240726 \
|
||||
torchvision==0.20.0.dev20240726 \
|
||||
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
|
||||
&& python3 -m pip install --pre \
|
||||
torch==2.6.0.dev20240918 \
|
||||
'setuptools-scm>=8' \
|
||||
torchvision==0.20.0.dev20240918 \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
|
||||
*) ;; esac
|
||||
|
||||
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
||||
@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
|
||||
### Flash-Attention wheel build stage
|
||||
FROM base AS build_fa
|
||||
ARG BUILD_FA
|
||||
ARG TRY_FA_WHEEL
|
||||
ARG FA_WHEEL_URL
|
||||
ARG FA_GFX_ARCHS
|
||||
ARG FA_BRANCH
|
||||
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
if [ "$BUILD_FA" = "1" ]; then \
|
||||
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
|
||||
# If a suitable wheel exists, we download it instead of building FA
|
||||
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
|
||||
else \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||
&& cd flash-attention \
|
||||
&& git checkout "${FA_BRANCH}" \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||
fi; \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||
&& cd flash-attention \
|
||||
&& git checkout "${FA_BRANCH}" \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||
# Create an empty directory otherwise as later build stages expect one
|
||||
else mkdir -p /install; \
|
||||
fi
|
||||
@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
if [ "$BUILD_TRITON" = "1" ]; then \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& python3 -m pip install ninja cmake wheel pybind11 \
|
||||
&& git clone https://github.com/OpenAI/triton.git \
|
||||
&& cd triton \
|
||||
&& git checkout "${TRITON_BRANCH}" \
|
||||
@ -126,10 +117,15 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
FROM base AS final
|
||||
# Import the vLLM development directory from the build context
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
|
||||
# Package upgrades for useful functionality or to avoid dependency issues
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
|
||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
|
||||
|
||||
|
||||
# Workaround for ray >= 2.10.0
|
||||
@ -138,15 +134,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV TOKENIZERS_PARALLELISM=false
|
||||
|
||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install -Ur requirements-rocm.txt \
|
||||
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.1"*) \
|
||||
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
|
||||
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
|
||||
# Prevent interference if torch bundles its own HIP runtime
|
||||
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
|
||||
*) ;; esac \
|
||||
&& python3 setup.py clean --all \
|
||||
&& python3 setup.py develop
|
||||
|
||||
|
||||
@ -1,20 +1,25 @@
|
||||
ARG NIGHTLY_DATE="20240828"
|
||||
ARG NIGHTLY_DATE="20241017"
|
||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
WORKDIR /workspace
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
# Install the TPU and Pallas dependencies.
|
||||
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
RUN apt-get update && apt-get install -y \
|
||||
git \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
# Build vLLM.
|
||||
COPY . /workspace/vllm
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
ENV VLLM_TARGET_DEVICE="tpu"
|
||||
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
|
||||
RUN cd /workspace/vllm && python3 setup.py develop
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 -m pip install \
|
||||
-r requirements-tpu.txt
|
||||
RUN python3 setup.py develop
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@ -1,21 +1,68 @@
|
||||
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
|
||||
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
|
||||
COPY ./ /workspace/vllm
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
curl \
|
||||
ffmpeg \
|
||||
git \
|
||||
libsndfile1 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
lsb-release \
|
||||
numactl \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
# vim \
|
||||
wget
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
|
||||
COPY requirements-common.txt /workspace/vllm/requirements-common.txt
|
||||
|
||||
RUN pip install -v -r requirements-xpu.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install --no-cache-dir \
|
||||
-r requirements-xpu.txt
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
|
||||
RUN git clone https://github.com/intel/pti-gpu && \
|
||||
cd pti-gpu/sdk && \
|
||||
git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
|
||||
make -j && \
|
||||
cmake --install . --config Release --prefix "/usr/local"
|
||||
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
ENV VLLM_TARGET_DEVICE=xpu
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 setup.py install
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
FROM vllm-base AS vllm-openai
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image \
|
||||
TRITON_XPU_PROFILE 1
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
README.md (20 lines changed)
@ -10,22 +10,15 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
</h3>
|
||||
|
||||
<p align="center">
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
|
||||
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||
</p>
|
||||
|
||||
|
||||
---
|
||||
|
||||
**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
|
||||
|
||||
We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
|
||||
Join us to learn more about recent advancements of vLLM on MI300X.
|
||||
Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
|
||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
|
||||
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||
@ -51,7 +44,7 @@ vLLM is fast with:
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
|
||||
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||
**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
|
||||
@ -136,5 +129,6 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
||||
|
||||
* For technical questions and feature requests, please use Github issues or discussions.
|
||||
* For discussing with fellow users, please use Discord.
|
||||
* For coordinating contributions and development, please use Slack.
|
||||
* For security disclosures, please use Github's security advisory feature.
|
||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||
|
||||
SECURITY.md (new file, 11 lines)
@ -0,0 +1,11 @@
# Security Policy

## Reporting a Vulnerability

If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).

---

Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@ -6,3 +6,14 @@ You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

## Downloading the ShareGPT4V dataset

The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts will ignore a datapoint if the referred image is missing.

```bash
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/
```
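For reference, a minimal sketch (not part of the repo) of the "skip missing images" behaviour described above; it assumes each JSON entry stores a relative image path under an `image` key:

```python
# Hypothetical helper: drop ShareGPT4V entries whose image file is not present
# locally, mirroring how the benchmark scripts ignore such datapoints.
import json
from pathlib import Path

def filter_available(json_path: str, image_root: str = ".") -> list:
    """Keep only entries whose referenced image exists on disk."""
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    root = Path(image_root)
    return [d for d in data if "image" in d and (root / d["image"]).is_file()]

print(len(filter_available("sharegpt4v_instruct_gpt4-vision_cap100k.json")))
```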
@ -23,8 +23,9 @@ class RequestFuncInput:
|
||||
output_len: int
|
||||
model: str
|
||||
best_of: int = 1
|
||||
use_beam_search: bool = False
|
||||
logprobs: Optional[int] = None
|
||||
multi_modal_content: Optional[dict] = None
|
||||
ignore_eos: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -47,13 +48,13 @@ async def async_request_tgi(
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
params = {
|
||||
"best_of": request_func_input.best_of,
|
||||
"max_new_tokens": request_func_input.output_len,
|
||||
"do_sample": True,
|
||||
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||
# TGI does not accept ignore_eos flag.
|
||||
}
|
||||
payload = {
|
||||
"inputs": request_func_input.prompt,
|
||||
@ -78,7 +79,7 @@ async def async_request_tgi(
|
||||
# any data, we should skip it.
|
||||
if chunk_bytes.startswith(":"):
|
||||
continue
|
||||
chunk = remove_prefix(chunk_bytes, "data:")
|
||||
chunk = chunk_bytes.removeprefix("data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
timestamp = time.perf_counter()
|
||||
@ -118,7 +119,6 @@ async def async_request_trt_llm(
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
assert request_func_input.best_of == 1
|
||||
payload = {
|
||||
"accumulate_tokens": True,
|
||||
@ -128,6 +128,8 @@ async def async_request_trt_llm(
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
}
|
||||
if request_func_input.ignore_eos:
|
||||
payload["min_length"] = request_func_input.output_len
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
@ -142,8 +144,8 @@ async def async_request_trt_llm(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||
"data:")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
output.generated_text += data["text_output"]
|
||||
@ -182,7 +184,6 @@ async def async_request_deepspeed_mii(
|
||||
) -> RequestFuncOutput:
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert request_func_input.best_of == 1
|
||||
assert not request_func_input.use_beam_search
|
||||
|
||||
payload = {
|
||||
"prompt": request_func_input.prompt,
|
||||
@ -230,7 +231,6 @@ async def async_request_openai_completions(
|
||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"prompt": request_func_input.prompt,
|
||||
@ -239,6 +239,7 @@ async def async_request_openai_completions(
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"logprobs": request_func_input.logprobs,
|
||||
"stream": True,
|
||||
"ignore_eos": request_func_input.ignore_eos,
|
||||
}
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||
@ -255,13 +256,14 @@ async def async_request_openai_completions(
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
if response.status == 200:
|
||||
first_chunk_received = False
|
||||
async for chunk_bytes in response.content:
|
||||
chunk_bytes = chunk_bytes.strip()
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
if chunk == "[DONE]":
|
||||
latency = time.perf_counter() - st
|
||||
else:
|
||||
@ -273,7 +275,8 @@ async def async_request_openai_completions(
|
||||
if data["choices"][0]["text"]:
|
||||
timestamp = time.perf_counter()
|
||||
# First token
|
||||
if ttft == 0.0:
|
||||
if not first_chunk_received:
|
||||
first_chunk_received = True
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
|
||||
@ -284,9 +287,14 @@ async def async_request_openai_completions(
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += data["choices"][0]["text"]
|
||||
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
else:
|
||||
output.success = False
|
||||
output.error = (
|
||||
"Never received a valid chunk to calculate TTFT."
|
||||
"This response will be marked as failed!")
|
||||
output.generated_text = generated_text
|
||||
output.success = True
|
||||
output.latency = latency
|
||||
else:
|
||||
output.error = response.reason or ""
|
||||
@ -311,18 +319,21 @@ async def async_request_openai_chat_completions(
|
||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
if request_func_input.multi_modal_content:
|
||||
content.append(request_func_input.multi_modal_content)
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": request_func_input.prompt,
|
||||
"content": content
|
||||
},
|
||||
],
|
||||
"temperature": 0.0,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
"ignore_eos": request_func_input.ignore_eos,
|
||||
}
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
@ -345,8 +356,8 @@ async def async_request_openai_chat_completions(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
if chunk == "[DONE]":
|
||||
latency = time.perf_counter() - st
|
||||
else:
|
||||
@ -385,14 +396,6 @@ async def async_request_openai_chat_completions(
|
||||
return output
|
||||
|
||||
|
||||
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
|
||||
# introduced in Python 3.9
|
||||
def remove_prefix(text: str, prefix: str) -> str:
|
||||
if text.startswith(prefix):
|
||||
return text[len(prefix):]
|
||||
return text
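The helper above is dropped, presumably because Python 3.8 support is no longer needed, so the built-in `str.removeprefix` (Python 3.9+) can be used directly. A quick illustration of the equivalence, for reference only:

```python
# str.removeprefix (Python 3.9+) behaves like the removed helper.
def remove_prefix(text: str, prefix: str) -> str:
    return text[len(prefix):] if text.startswith(prefix) else text

chunk = 'data: {"text_output": "hi"}'
assert chunk.removeprefix("data: ") == remove_prefix(chunk, "data: ")
assert "no-prefix".removeprefix("data: ") == "no-prefix"  # unchanged when the prefix is absent
```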
|
||||
|
||||
|
||||
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
||||
from modelscope import snapshot_download
|
||||
@ -426,4 +429,5 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"openai-chat": async_request_openai_chat_completions,
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
"""Benchmark the latency of processing a single batch of requests."""
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
@ -10,50 +11,24 @@ import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
|
||||
from vllm.inputs import PromptInputs
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.inputs import PromptType
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||
# the engine will automatically process the request in multiple batches.
|
||||
llm = LLM(
|
||||
model=args.model,
|
||||
speculative_model=args.speculative_model,
|
||||
num_speculative_tokens=args.num_speculative_tokens,
|
||||
speculative_draft_tensor_parallel_size=\
|
||||
args.speculative_draft_tensor_parallel_size,
|
||||
tokenizer=args.tokenizer,
|
||||
quantization=args.quantization,
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
dtype=args.dtype,
|
||||
max_model_len=args.max_model_len,
|
||||
enforce_eager=args.enforce_eager,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
quantization_param_path=args.quantization_param_path,
|
||||
device=args.device,
|
||||
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
||||
use_v2_block_manager=args.use_v2_block_manager,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
download_dir=args.download_dir,
|
||||
block_size=args.block_size,
|
||||
gpu_memory_utilization=args.gpu_memory_utilization,
|
||||
load_format=args.load_format,
|
||||
distributed_executor_backend=args.distributed_executor_backend,
|
||||
otlp_traces_endpoint=args.otlp_traces_endpoint,
|
||||
enable_prefix_caching=args.enable_prefix_caching,
|
||||
)
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
n=args.n,
|
||||
temperature=0.0 if args.use_beam_search else 1.0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=args.use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=args.output_len,
|
||||
)
|
||||
@ -61,7 +36,7 @@ def main(args: argparse.Namespace):
|
||||
dummy_prompt_token_ids = np.random.randint(10000,
|
||||
size=(args.batch_size,
|
||||
args.input_len))
|
||||
dummy_inputs: List[PromptInputs] = [{
|
||||
dummy_prompts: List[PromptType] = [{
|
||||
"prompt_token_ids": batch
|
||||
} for batch in dummy_prompt_token_ids.tolist()]
|
||||
|
||||
@ -74,13 +49,13 @@ def main(args: argparse.Namespace):
|
||||
],
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
str(profile_dir))) as p:
|
||||
llm.generate(dummy_inputs,
|
||||
llm.generate(dummy_prompts,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=False)
|
||||
print(p.key_averages())
|
||||
else:
|
||||
start_time = time.perf_counter()
|
||||
llm.generate(dummy_inputs,
|
||||
llm.generate(dummy_prompts,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=False)
|
||||
end_time = time.perf_counter()
|
||||
@ -127,19 +102,6 @@ if __name__ == '__main__':
|
||||
parser = FlexibleArgumentParser(
|
||||
description='Benchmark the latency of processing a single batch of '
|
||||
'requests till completion.')
|
||||
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
||||
parser.add_argument('--speculative-model', type=str, default=None)
|
||||
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
||||
parser.add_argument('--speculative-draft-tensor-parallel-size',
|
||||
'-spec-draft-tp',
|
||||
type=int,
|
||||
default=None)
|
||||
parser.add_argument('--tokenizer', type=str, default=None)
|
||||
parser.add_argument('--quantization',
|
||||
'-q',
|
||||
choices=[*QUANTIZATION_METHODS, None],
|
||||
default=None)
|
||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
||||
parser.add_argument('--input-len', type=int, default=32)
|
||||
parser.add_argument('--output-len', type=int, default=128)
|
||||
parser.add_argument('--batch-size', type=int, default=8)
|
||||
@ -156,45 +118,6 @@ if __name__ == '__main__':
|
||||
type=int,
|
||||
default=30,
|
||||
help='Number of iterations to run.')
|
||||
parser.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
help='trust remote code from huggingface')
|
||||
parser.add_argument(
|
||||
'--max-model-len',
|
||||
type=int,
|
||||
default=None,
|
||||
help='Maximum length of a sequence (including prompt and output). '
|
||||
'If None, will be derived from the model.')
|
||||
parser.add_argument(
|
||||
'--dtype',
|
||||
type=str,
|
||||
default='auto',
|
||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
||||
help='data type for model weights and activations. '
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--enforce-eager',
|
||||
action='store_true',
|
||||
help='enforce eager mode and disable CUDA graph')
|
||||
parser.add_argument(
|
||||
'--kv-cache-dtype',
|
||||
type=str,
|
||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
||||
default="auto",
|
||||
help='Data type for kv cache storage. If "auto", will use model '
|
||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||
'instead supported for common inference criteria.')
|
||||
parser.add_argument(
|
||||
'--profile',
|
||||
action='store_true',
|
||||
@ -205,79 +128,12 @@ if __name__ == '__main__':
|
||||
default=None,
|
||||
help=('path to save the pytorch profiler output. Can be visualized '
|
||||
'with ui.perfetto.dev or Tensorboard.'))
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=DEVICE_OPTIONS,
|
||||
help='device type for vLLM execution')
|
||||
parser.add_argument('--block-size',
|
||||
type=int,
|
||||
default=16,
|
||||
help='block size of key/value cache')
|
||||
parser.add_argument(
|
||||
'--enable-chunked-prefill',
|
||||
action='store_true',
|
||||
help='If True, the prefill requests can be chunked based on the '
|
||||
'max_num_batched_tokens')
|
||||
parser.add_argument("--enable-prefix-caching",
|
||||
action='store_true',
|
||||
help="Enable automatic prefix caching")
|
||||
parser.add_argument('--use-v2-block-manager', action='store_true')
|
||||
parser.add_argument(
|
||||
"--ray-workers-use-nsight",
|
||||
action='store_true',
|
||||
help="If specified, use nsight to profile ray workers",
|
||||
)
|
||||
parser.add_argument('--download-dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the latency results in JSON format.')
|
||||
parser.add_argument('--gpu-memory-utilization',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument(
|
||||
'--load-format',
|
||||
type=str,
|
||||
default=EngineArgs.load_format,
|
||||
choices=[
|
||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||
'bitsandbytes'
|
||||
],
|
||||
help='The format of the model weights to load.\n\n'
|
||||
'* "auto" will try to load the weights in the safetensors format '
|
||||
'and fall back to the pytorch bin format if safetensors format '
|
||||
'is not available.\n'
|
||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||
'* "npcache" will load the weights in pytorch format and store '
|
||||
'a numpy cache to speed up the loading.\n'
|
||||
'* "dummy" will initialize the weights with random values, '
|
||||
'which is mainly for profiling.\n'
|
||||
'* "tensorizer" will load the weights using tensorizer from '
|
||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||
'section for more information.\n'
|
||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||
'quantization.\n')
|
||||
parser.add_argument(
|
||||
'--distributed-executor-backend',
|
||||
choices=['ray', 'mp'],
|
||||
default=None,
|
||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||
'is used, will be automatically set to "ray" if installed '
|
||||
'or "mp" (multiprocessing) otherwise.')
|
||||
parser.add_argument(
|
||||
'--otlp-traces-endpoint',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Target URL to which OpenTelemetry traces will be sent.')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -25,6 +25,7 @@ ShareGPT example usage:
|
||||
--input-length-range 128:256
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
@ -33,6 +34,7 @@ from typing import List, Optional, Tuple
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
try:
|
||||
@ -113,10 +115,10 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
|
||||
def main(args):
|
||||
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
|
||||
input_length_range = tuple(map(int, args.input_length_range.split(':')))
|
||||
|
||||
random.seed(args.seed)
|
||||
if args.dataset_path is not None:
|
||||
print(f"Start to sample {args.num_prompts} prompts"
|
||||
"from {args.dataset_path}")
|
||||
f"from {args.dataset_path}")
|
||||
filtered_datasets = sample_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
@ -129,13 +131,9 @@ def main(args):
|
||||
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
|
||||
] * args.num_prompts
|
||||
|
||||
llm = LLM(model=args.model,
|
||||
tokenizer_mode='auto',
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
use_v2_block_manager=args.use_v2_block_manager,
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
enable_prefix_caching=args.enable_prefix_caching)
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
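The benchmarks now delegate all engine flags to `EngineArgs.add_cli_args` and expand the parsed dataclass straight into the `LLM` constructor instead of hand-copying keyword arguments. A hedged sketch of that pattern (the model name and flags here are placeholders and a working vLLM install is assumed):

```python
import dataclasses

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="toy EngineArgs-driven example")
parser = EngineArgs.add_cli_args(parser)  # registers --model, --dtype, -tp, ... in one place
args = parser.parse_args(["--model", "facebook/opt-125m", "--enforce-eager"])

engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))  # no hand-maintained keyword list
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```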
|
||||
|
||||
@ -144,13 +142,6 @@ def main(args):
|
||||
repeat_count=args.repeat_count,
|
||||
sort=args.sort)
|
||||
|
||||
print("------warm up------")
|
||||
test_prefix(
|
||||
llm=llm,
|
||||
prompts=prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
print("------start generating------")
|
||||
test_prefix(
|
||||
llm=llm,
|
||||
@ -163,21 +154,11 @@ if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description=
|
||||
'Benchmark the performance with or without automatic prefix caching.')
|
||||
parser.add_argument('--model',
|
||||
type=str,
|
||||
default='baichuan-inc/Baichuan2-13B-Chat')
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
||||
parser.add_argument('--output-len', type=int, default=10)
|
||||
parser.add_argument('--enable-prefix-caching',
|
||||
action='store_true',
|
||||
help='enable prefix caching')
|
||||
parser.add_argument('--use-v2-block-manager',
|
||||
action='store_true',
|
||||
help='Use BlockSpaceMangerV2')
|
||||
parser.add_argument('--num-prompts',
|
||||
type=int,
|
||||
default=1,
|
||||
@ -194,5 +175,7 @@ if __name__ == "__main__":
|
||||
default='128:256',
|
||||
help='Range of input lengths for sampling prompts,'
|
||||
'specified as "min:max" (e.g., "128:256").')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
177
benchmarks/benchmark_prioritization.py
Normal file
177
benchmarks/benchmark_prioritization.py
Normal file
@ -0,0 +1,177 @@
|
||||
"""Benchmark offline prioritization."""
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int],
|
||||
) -> List[Tuple[str, int, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
dataset = json.load(f)
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
# Only keep the first two turns of each conversation.
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
|
||||
# Filter out sequences that are too long or too short
|
||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
||||
for i in range(len(dataset)):
|
||||
if len(filtered_dataset) == num_requests:
|
||||
break
|
||||
|
||||
# Tokenize the prompts and completions.
|
||||
prompt = dataset[i][0]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if prompt_len < 4 or output_len < 4:
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
|
||||
# Select an equi-probable random priority
|
||||
priority = 0 if random.random() < 0.5 else 1
|
||||
|
||||
filtered_dataset.append((prompt, prompt_len, output_len, priority))
|
||||
|
||||
return filtered_dataset
|
||||
|
||||
|
||||
def run_vllm(
|
||||
requests: List[Tuple[str, int, int, int]],
|
||||
n: int,
|
||||
engine_args: EngineArgs,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts = []
|
||||
sampling_params = []
|
||||
priority = []
|
||||
for prompt, _, output_len, _priority in requests:
|
||||
prompts.append(prompt)
|
||||
priority.append(_priority)
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
))
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
return end - start
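The only engine-facing change this benchmark exercises is the `priority=` argument to `llm.generate`. A hedged usage sketch follows; it assumes a priority-aware scheduler is enabled (for example `scheduling_policy="priority"` in recent vLLM releases) and that lower values are served earlier, matching the 0/1 priorities sampled above:

```python
from vllm import LLM, SamplingParams

# Assumption: without a priority-aware scheduling policy, per-request
# priorities are ignored or rejected by the engine.
llm = LLM(model="facebook/opt-125m", scheduling_policy="priority")
prompts = ["urgent request", "background request"]
priorities = [0, 1]  # one value per prompt; lower = more urgent (assumed convention)
params = SamplingParams(temperature=1.0, top_p=1.0, ignore_eos=True, max_tokens=16)

outputs = llm.generate(prompts, [params] * len(prompts), priority=priorities)
for out in outputs:
    print(out.outputs[0].text)
```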
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
|
||||
# Sample the requests.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
if args.dataset is None:
|
||||
# Synthesize a prompt with the given input length.
|
||||
prompt = "hi" * (args.input_len - 1)
|
||||
requests = [(prompt, args.input_len, args.output_len)
|
||||
for _ in range(args.num_prompts)]
|
||||
else:
|
||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
||||
args.output_len)
|
||||
|
||||
if args.backend == "vllm":
|
||||
elapsed_time = run_vllm(requests, args.n,
|
||||
EngineArgs.from_cli_args(args))
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {args.backend}")
|
||||
total_num_tokens = sum(prompt_len + output_len
|
||||
for _, prompt_len, output_len, priority in requests)
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
|
||||
# Output JSON results if specified
|
||||
if args.output_json:
|
||||
results = {
|
||||
"elapsed_time": elapsed_time,
|
||||
"num_requests": len(requests),
|
||||
"total_num_tokens": total_num_tokens,
|
||||
"requests_per_second": len(requests) / elapsed_time,
|
||||
"tokens_per_second": total_num_tokens / elapsed_time,
|
||||
}
|
||||
with open(args.output_json, "w") as f:
|
||||
json.dump(results, f, indent=4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||
parser.add_argument("--backend",
|
||||
type=str,
|
||||
choices=["vllm", "hf", "mii"],
|
||||
default="vllm")
|
||||
parser.add_argument("--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument("--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Input prompt length for each request")
|
||||
parser.add_argument("--output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--num-prompts",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
if args.dataset is None:
|
||||
assert args.input_len is not None
|
||||
assert args.output_len is not None
|
||||
else:
|
||||
assert args.input_len is None
|
||||
|
||||
main(args)
|
||||
@ -1,4 +1,4 @@
|
||||
"""Benchmark online serving throughput.
|
||||
r"""Benchmark online serving throughput.
|
||||
|
||||
On the server side, run one of the following commands:
|
||||
vLLM OpenAI API server
|
||||
@ -24,6 +24,8 @@ On the client side, run:
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
@ -31,11 +33,13 @@ import time
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from datasets import load_dataset
|
||||
from PIL.Image import Image
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@ -49,6 +53,8 @@ try:
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkMetrics:
|
||||
@ -56,6 +62,7 @@ class BenchmarkMetrics:
|
||||
total_input: int
|
||||
total_output: int
|
||||
request_throughput: float
|
||||
request_goodput: float
|
||||
output_throughput: float
|
||||
total_token_throughput: float
|
||||
mean_ttft_ms: float
|
||||
@ -84,11 +91,9 @@ def sample_sharegpt_requests(
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int] = None,
|
||||
) -> List[Tuple[str, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
) -> List[Tuple[str, int, int, None]]:
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
with open(dataset_path, encoding='utf-8') as f:
|
||||
dataset = json.load(f)
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
@ -113,13 +118,13 @@ def sample_sharegpt_requests(
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if prompt_len < 4 or output_len < 4:
|
||||
if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
filtered_dataset.append((prompt, prompt_len, output_len))
|
||||
filtered_dataset.append((prompt, prompt_len, output_len, None))
|
||||
|
||||
return filtered_dataset
|
||||
|
||||
@ -131,13 +136,13 @@ def sample_sonnet_requests(
|
||||
output_len: int,
|
||||
prefix_len: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
) -> List[Tuple[str, str, int, int]]:
|
||||
) -> List[Tuple[str, str, int, int, None]]:
|
||||
assert (
|
||||
input_len > prefix_len
|
||||
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
|
||||
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
with open(dataset_path, encoding='utf-8') as f:
|
||||
poem_lines = f.readlines()
|
||||
|
||||
# Tokenize the poem lines.
|
||||
@ -174,9 +179,9 @@ def sample_sonnet_requests(
|
||||
# Sample the rest of lines per request.
|
||||
sampled_requests: List[Tuple[str, int, int]] = []
|
||||
for _ in range(num_requests):
|
||||
sampled_lines = "".join(
|
||||
prefix_lines +
|
||||
random.sample(poem_lines, num_input_lines - num_prefix_lines))
|
||||
num_lines_needed = num_input_lines - num_prefix_lines
|
||||
sampled_lines = "".join(prefix_lines +
|
||||
random.choices(poem_lines, k=num_lines_needed))
|
||||
|
||||
prompt = f"{base_prompt}{sampled_lines}"
|
||||
message = [
|
||||
@ -189,7 +194,67 @@ def sample_sonnet_requests(
|
||||
message, add_generation_prompt=True, tokenize=False)
|
||||
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
||||
sampled_requests.append(
|
||||
(prompt, prompt_formatted, prompt_len, output_len))
|
||||
(prompt, prompt_formatted, prompt_len, output_len, None))
|
||||
|
||||
return sampled_requests
|
||||
|
||||
|
||||
def sample_hf_requests(
|
||||
dataset_path: str,
|
||||
dataset_subset: str,
|
||||
dataset_split: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
random_seed: int,
|
||||
fixed_output_len: Optional[int] = None,
|
||||
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
|
||||
dataset = load_dataset(dataset_path,
|
||||
name=dataset_subset,
|
||||
split=dataset_split,
|
||||
streaming=True)
|
||||
assert "conversations" in dataset.features, (
|
||||
"HF Dataset must have 'conversations' column.")
|
||||
filter_func = lambda x: len(x["conversations"]) >= 2
|
||||
filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
|
||||
sampled_requests: List[Tuple[str, int, int, Dict[str,
|
||||
Collection[str]]]] = []
|
||||
for data in filtered_dataset:
|
||||
if len(sampled_requests) == num_requests:
|
||||
break
|
||||
|
||||
# Tokenize the prompts and completions.
|
||||
prompt = data["conversations"][0]["value"]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = data["conversations"][1]["value"]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
if fixed_output_len is None and \
|
||||
(prompt_len > 1024 or prompt_len + output_len > 2048):
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
|
||||
if "image" in data and isinstance(data["image"], Image):
|
||||
image: Image = data["image"]
|
||||
image = image.convert("RGB")
|
||||
image_data = io.BytesIO()
|
||||
image.save(image_data, format='JPEG')
|
||||
image_base64 = base64.b64encode(
|
||||
image_data.getvalue()).decode("utf-8")
|
||||
mm_content = {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{image_base64}"
|
||||
},
|
||||
}
|
||||
else:
|
||||
mm_content = None
|
||||
|
||||
sampled_requests.append((prompt, prompt_len, output_len, mm_content))
|
||||
|
||||
return sampled_requests
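To make the multi-modal plumbing concrete, here is a small self-contained sketch (illustrative only, using Pillow and a synthetic image) of how an `mm_content` entry like the one built above ends up inside the OpenAI-chat message that the benchmark sends:

```python
import base64
import io

from PIL import Image  # assumes Pillow is installed

img = Image.new("RGB", (8, 8), "red")  # stand-in for a dataset image
buf = io.BytesIO()
img.save(buf, format="JPEG")
image_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

# Same shape as the mm_content dict produced by sample_hf_requests.
mm_content = {"type": "image_url",
              "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
content = [{"type": "text", "text": "Describe the image."}, mm_content]
messages = [{"role": "user", "content": content}]
print(messages[0]["content"][1]["type"])  # image_url
```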
|
||||
|
||||
@ -223,8 +288,8 @@ def sample_random_requests(
|
||||
[(offsets[i] + i + j) % tokenizer.vocab_size
|
||||
for j in range(input_lens[i])])
|
||||
|
||||
input_requests.append(
|
||||
(prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
|
||||
input_requests.append((prompt, int(prefix_len + input_lens[i]),
|
||||
int(output_lens[i]), None))
|
||||
|
||||
return input_requests
|
||||
|
||||
@ -232,8 +297,33 @@ def sample_random_requests(
|
||||
async def get_request(
|
||||
input_requests: List[Tuple[str, int, int]],
|
||||
request_rate: float,
|
||||
burstiness: float = 1.0,
|
||||
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
||||
"""
|
||||
Asynchronously generates requests at a specified rate
|
||||
with OPTIONAL burstiness.
|
||||
|
||||
Args:
|
||||
input_requests:
|
||||
A list of input requests, each represented as a tuple.
|
||||
request_rate:
|
||||
The rate at which requests are generated (requests/s).
|
||||
burstiness (optional):
|
||||
The burstiness factor of the request generation.
|
||||
Only takes effect when request_rate is not inf.
|
||||
Default value is 1, which follows a Poisson process.
|
||||
Otherwise, the request intervals follow a gamma distribution.
|
||||
A lower burstiness value (0 < burstiness < 1) results
|
||||
in more bursty requests, while a higher burstiness value
|
||||
(burstiness > 1) results in a more uniform arrival of requests.
|
||||
"""
|
||||
input_requests = iter(input_requests)
|
||||
|
||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||
assert burstiness > 0, (
|
||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
||||
theta = 1.0 / (request_rate * burstiness)
|
||||
|
||||
for request in input_requests:
|
||||
yield request
|
||||
|
||||
@ -241,8 +331,9 @@ async def get_request(
|
||||
# If the request rate is infinity, then we don't need to wait.
|
||||
continue
|
||||
|
||||
# Sample the request interval from the exponential distribution.
|
||||
interval = np.random.exponential(1.0 / request_rate)
|
||||
# Sample the request interval from the gamma distribution.
|
||||
# If burstiness is 1, it follows exponential distribution.
|
||||
interval = np.random.gamma(shape=burstiness, scale=theta)
|
||||
# The next request will be sent after the interval.
|
||||
await asyncio.sleep(interval)
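The docstring above can be checked numerically: with `shape=burstiness` and `scale=1/(request_rate*burstiness)`, the mean gap stays at `1/request_rate` for any burstiness, and `burstiness=1` collapses to the previous exponential (Poisson) behaviour. A quick sketch:

```python
import numpy as np

request_rate = 10.0  # requests per second
for burstiness in (0.5, 1.0, 2.0):
    theta = 1.0 / (request_rate * burstiness)
    gaps = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
    # The mean interval always targets 1/request_rate; smaller burstiness
    # gives a larger coefficient of variation, i.e. spikier arrivals.
    print(f"burstiness={burstiness}: mean={gaps.mean():.3f}s "
          f"(target {1/request_rate:.3f}s), cv={gaps.std()/gaps.mean():.2f}")
# burstiness=1.0 reproduces the exponential case: cv is approximately 1.
```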
|
||||
|
||||
@ -254,12 +345,15 @@ def calculate_metrics(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
selected_percentile_metrics: List[str],
|
||||
selected_percentiles: List[float],
|
||||
gootput_config_dict: Dict[str, float],
|
||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||
actual_output_lens: List[int] = []
|
||||
total_input = 0
|
||||
completed = 0
|
||||
good_completed = 0
|
||||
itls: List[float] = []
|
||||
tpots: List[float] = []
|
||||
all_tpots: List[float] = []
|
||||
ttfts: List[float] = []
|
||||
e2els: List[float] = []
|
||||
for i in range(len(outputs)):
|
||||
@ -273,9 +367,13 @@ def calculate_metrics(
|
||||
add_special_tokens=False).input_ids)
|
||||
actual_output_lens.append(output_len)
|
||||
total_input += input_requests[i][1]
|
||||
tpot = 0
|
||||
if output_len > 1:
|
||||
tpots.append(
|
||||
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
||||
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
||||
1)
|
||||
tpots.append(tpot)
|
||||
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||
all_tpots.append(tpot)
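A worked example of the per-token latency (TPOT) formula above, for clarity:

```python
# With 2.0 s total latency, 0.5 s TTFT and 31 output tokens:
# TPOT = (2.0 - 0.5) / (31 - 1) = 0.05 s per decoded token.
latency, ttft, output_len = 2.0, 0.5, 31
tpot = (latency - ttft) / (output_len - 1) if output_len > 1 else 0
print(tpot)  # 0.05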
|
||||
itls += outputs[i].itl
|
||||
ttfts.append(outputs[i].ttft)
|
||||
e2els.append(outputs[i].latency)
|
||||
@ -283,6 +381,28 @@ def calculate_metrics(
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
|
||||
if gootput_config_dict:
|
||||
valid_metrics = []
|
||||
slo_values = []
|
||||
|
||||
if "ttft" in gootput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(gootput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "tpot" in gootput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(gootput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "e2el" in gootput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(gootput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
if is_good_req:
|
||||
good_completed += 1
|
||||
|
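A minimal sketch of the goodput test above, assuming hypothetical SLOs of 300 ms TTFT and 50 ms TPOT; SLO values are given in milliseconds (as on the CLI) while the observed metrics are in seconds, hence the conversion constant:

MILLISECONDS_TO_SECONDS_CONVERSION = 1000
slo_ms = {"ttft": 300.0, "tpot": 50.0}      # hypothetical service level objectives
observed_s = {"ttft": 0.21, "tpot": 0.047}  # one request's measured metrics
is_good_req = all(
    slo_ms[name] / MILLISECONDS_TO_SECONDS_CONVERSION >= observed_s[name]
    for name in slo_ms)
# is_good_req is True, so this request would count toward goodput.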
||||
if completed == 0:
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
@ -293,6 +413,7 @@ def calculate_metrics(
|
||||
total_input=total_input,
|
||||
total_output=sum(actual_output_lens),
|
||||
request_throughput=completed / dur_s,
|
||||
request_goodput=good_completed / dur_s,
|
||||
output_throughput=sum(actual_output_lens) / dur_s,
|
||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||
@ -311,9 +432,9 @@ def calculate_metrics(
|
||||
median_itl_ms=np.median(itls or 0) * 1000,
|
||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
mean_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
)
|
||||
@ -330,12 +451,15 @@ async def benchmark(
|
||||
input_requests: List[Tuple[str, int, int]],
|
||||
logprobs: Optional[int],
|
||||
best_of: int,
|
||||
use_beam_search: bool,
|
||||
request_rate: float,
|
||||
burstiness: float,
|
||||
disable_tqdm: bool,
|
||||
profile: bool,
|
||||
selected_percentile_metrics: List[str],
|
||||
selected_percentiles: List[str],
|
||||
ignore_eos: bool,
|
||||
gootput_config_dict: Dict[str, float],
|
||||
max_concurrency: Optional[int],
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -343,7 +467,12 @@ async def benchmark(
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
print("Starting initial single prompt test run...")
|
||||
test_prompt, test_prompt_len, test_output_len = input_requests[0]
|
||||
test_prompt, test_prompt_len, test_output_len, test_mm_content = (
|
||||
input_requests[0])
|
||||
if backend != "openai-chat" and test_mm_content is not None:
|
||||
# multi-modal benchmark is only available on OpenAI Chat backend.
|
||||
raise ValueError(
|
||||
"Multi-modal content is only supported on 'openai-chat' backend.")
|
||||
test_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_prompt,
|
||||
@ -352,7 +481,8 @@ async def benchmark(
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos,
|
||||
)
|
||||
test_output = await request_func(request_func_input=test_input)
|
||||
if not test_output.success:
|
||||
@ -364,42 +494,62 @@ async def benchmark(
|
||||
|
||||
if profile:
|
||||
print("Starting profiler...")
|
||||
profile_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
profile_input = RequestFuncInput(model=model_id,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos)
|
||||
profile_output = await request_func(request_func_input=profile_input)
|
||||
if profile_output.success:
|
||||
print("Profiler started")
|
||||
|
||||
if burstiness == 1.0:
|
||||
distribution = "Poisson process"
|
||||
else:
|
||||
distribution = "Gamma distribution"
|
||||
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
print(f"Burstiness factor: {burstiness} ({distribution})")
|
||||
print(f"Maximum request concurrency: {max_concurrency}")
|
||||
|
||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||
|
||||
# This can be used once the minimum Python version is 3.10 or higher,
|
||||
# and it will simplify the code in limited_request_func.
|
||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
# if max_concurrency else contextlib.nullcontext())
|
||||
semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
if max_concurrency else None)
|
||||
|
||||
async def limited_request_func(request_func_input, pbar):
|
||||
if semaphore is None:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
async with semaphore:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
|
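For reference, the commented-out variant mentioned above would collapse the helper into a single branch once Python 3.10 (where contextlib.nullcontext gained async support) is the minimum version. This is an illustrative sketch, not part of the diff; request_func and max_concurrency come from the surrounding benchmark code.

import asyncio
import contextlib

semaphore = (asyncio.Semaphore(max_concurrency)
             if max_concurrency else contextlib.nullcontext())

async def limited_request_func(request_func_input, pbar):
    # nullcontext() is a no-op (async) context manager, so the unlimited case
    # needs no separate code path.
    async with semaphore:
        return await request_func(request_func_input=request_func_input,
                                  pbar=pbar)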
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks: List[asyncio.Task] = []
|
||||
async for request in get_request(input_requests, request_rate):
|
||||
prompt, prompt_len, output_len = request
|
||||
request_func_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
api_url=api_url,
|
||||
prompt_len=prompt_len,
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
async for request in get_request(input_requests, request_rate, burstiness):
|
||||
prompt, prompt_len, output_len, mm_content = request
|
||||
request_func_input = RequestFuncInput(model=model_id,
|
||||
prompt=prompt,
|
||||
api_url=api_url,
|
||||
prompt_len=prompt_len,
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
multi_modal_content=mm_content,
|
||||
ignore_eos=ignore_eos)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
limited_request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if profile:
|
||||
@ -412,7 +562,6 @@ async def benchmark(
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
profile_output = await request_func(request_func_input=profile_input)
|
||||
if profile_output.success:
|
||||
@ -430,6 +579,7 @@ async def benchmark(
|
||||
tokenizer=tokenizer,
|
||||
selected_percentile_metrics=selected_percentile_metrics,
|
||||
selected_percentiles=selected_percentiles,
|
||||
gootput_config_dict=gootput_config_dict,
|
||||
)
|
||||
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
@ -441,6 +591,9 @@ async def benchmark(
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
if gootput_config_dict:
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
@ -452,6 +605,8 @@ async def benchmark(
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"request_throughput": metrics.request_throughput,
|
||||
"request_goodput:":
|
||||
metrics.request_goodput if gootput_config_dict else None,
|
||||
"output_throughput": metrics.output_throughput,
|
||||
"total_token_throughput": metrics.total_token_throughput,
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
@ -470,7 +625,7 @@ async def benchmark(
|
||||
# E.g., "Time to First Token"
|
||||
metric_header: str,
|
||||
):
|
||||
# This function print and add statistics of the specified
|
||||
# This function prints and adds statistics of the specified
|
||||
# metric.
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
@ -505,6 +660,41 @@ async def benchmark(
|
||||
return result
|
||||
|
||||
|
||||
def check_goodput_args(args):
|
||||
# Check and parse goodput arguments
|
||||
gootput_config_dict = {}
|
||||
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
||||
if args.goodput:
|
||||
gootput_config_dict = parse_goodput(args.goodput)
|
||||
for slo_name, slo_val in gootput_config_dict.items():
|
||||
if slo_name not in VALID_NAMES:
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative.")
|
||||
return gootput_config_dict
|
||||
|
||||
|
||||
def parse_goodput(slo_pairs):
|
||||
gootput_config_dict = {}
|
||||
try:
|
||||
for slo_pair in slo_pairs:
|
||||
slo_name, slo_val = slo_pair.split(":")
|
||||
gootput_config_dict[slo_name] = float(slo_val)
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds.") from err
|
||||
return gootput_config_dict
|
||||
|
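As a usage illustration with hypothetical thresholds, passing --goodput ttft:300 tpot:50 reaches these helpers as a list of strings and is parsed into per-metric SLOs in milliseconds:

slo_pairs = ["ttft:300", "tpot:50"]   # what argparse collects for --goodput
config = parse_goodput(slo_pairs)
assert config == {"ttft": 300.0, "tpot": 50.0}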
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
@ -556,9 +746,9 @@ def main(args: argparse.Namespace):
|
||||
prefix_len=args.sonnet_prefix_len,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
input_requests = [(prompt, prompt_len, output_len)
|
||||
input_requests = [(prompt, prompt_len, output_len, None)
|
||||
for prompt, prompt_formatted, prompt_len,
|
||||
output_len in input_requests]
|
||||
output_len, _ in input_requests]
|
||||
else:
|
||||
assert (
|
||||
tokenizer.chat_template or tokenizer.default_chat_template
|
||||
@ -571,9 +761,20 @@ def main(args: argparse.Namespace):
|
||||
prefix_len=args.sonnet_prefix_len,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
input_requests = [(prompt_formatted, prompt_len, output_len)
|
||||
input_requests = [(prompt_formatted, prompt_len, output_len, None)
|
||||
for prompt, prompt_formatted, prompt_len,
|
||||
output_len in input_requests]
|
||||
output_len, _ in input_requests]
|
||||
|
||||
elif args.dataset_name == "hf":
|
||||
input_requests = sample_hf_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
dataset_subset=args.hf_subset,
|
||||
dataset_split=args.hf_split,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
random_seed=args.seed,
|
||||
fixed_output_len=args.hf_output_len,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "random":
|
||||
input_requests = sample_random_requests(
|
||||
@ -588,6 +789,8 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
||||
|
||||
gootput_config_dict = check_goodput_args(args)
|
||||
|
||||
benchmark_result = asyncio.run(
|
||||
benchmark(
|
||||
backend=backend,
|
||||
@ -598,14 +801,17 @@ def main(args: argparse.Namespace):
|
||||
input_requests=input_requests,
|
||||
logprobs=args.logprobs,
|
||||
best_of=args.best_of,
|
||||
use_beam_search=args.use_beam_search,
|
||||
request_rate=args.request_rate,
|
||||
burstiness=args.burstiness,
|
||||
disable_tqdm=args.disable_tqdm,
|
||||
profile=args.profile,
|
||||
selected_percentile_metrics=args.percentile_metrics.split(","),
|
||||
selected_percentiles=[
|
||||
float(p) for p in args.metric_percentiles.split(",")
|
||||
],
|
||||
ignore_eos=args.ignore_eos,
|
||||
gootput_config_dict=gootput_config_dict,
|
||||
max_concurrency=args.max_concurrency,
|
||||
))
|
||||
|
||||
# Save config and results to json
|
||||
@ -619,7 +825,6 @@ def main(args: argparse.Namespace):
|
||||
result_json["model_id"] = model_id
|
||||
result_json["tokenizer_id"] = tokenizer_id
|
||||
result_json["best_of"] = args.best_of
|
||||
result_json["use_beam_search"] = args.use_beam_search
|
||||
result_json["num_prompts"] = args.num_prompts
|
||||
|
||||
# Metadata
|
||||
@ -636,18 +841,22 @@ def main(args: argparse.Namespace):
|
||||
# Traffic
|
||||
result_json["request_rate"] = (
|
||||
args.request_rate if args.request_rate < float("inf") else "inf")
|
||||
result_json["burstiness"] = args.burstiness
|
||||
result_json["max_concurrency"] = args.max_concurrency
|
||||
|
||||
# Merge with benchmark result
|
||||
result_json = {**result_json, **benchmark_result}
|
||||
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
||||
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
|
||||
if args.max_concurrency is not None else "")
|
||||
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
|
||||
if args.result_filename:
|
||||
file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(file_name, "w") as outfile:
|
||||
with open(file_name, "w", encoding='utf-8') as outfile:
|
||||
json.dump(result_json, outfile)
|
||||
|
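With the new concurrency suffix, an invocation would produce a result file named like the following; all values below are invented for illustration:

backend, request_rate, max_concurrency = "vllm", 4.0, 64
base_model_id, current_dt = "opt-125m", "20241101-120000"
max_concurrency_str = f"-concurrency{max_concurrency}"
file_name = f"{backend}-{request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
# -> "vllm-4.0qps-concurrency64-opt-125m-20241101-120000.json"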
||||
|
||||
@ -685,13 +894,27 @@ if __name__ == "__main__":
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "sonnet", "random"],
|
||||
choices=["sharegpt", "sonnet", "random", "hf"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
help="Path to the sharegpt/sonnet dataset. "
|
||||
"Or the huggingface dataset ID if using HF dataset.")
|
||||
parser.add_argument(
|
||||
"--max-concurrency",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum number of concurrent requests. This can be used "
|
||||
"to help simulate an environment where a higher level component "
|
||||
"is enforcing a maximum number of concurrent requests. While the "
|
||||
"--request-rate argument controls the rate at which requests are "
|
||||
"initiated, this argument will control how many are actually allowed "
|
||||
"to execute at a time. This means that when used in combination, the "
|
||||
"actual request rate may be lower than specified with --request-rate, "
|
||||
"if the server is not processing requests fast enough to keep up.")
|
||||
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
@ -718,26 +941,6 @@ if __name__ == "__main__":
|
||||
default=1000,
|
||||
help="Number of prompts to process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sharegpt-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output length "
|
||||
"from the ShareGPT dataset.")
|
||||
parser.add_argument(
|
||||
"--sonnet-input-len",
|
||||
type=int,
|
||||
default=550,
|
||||
help=
|
||||
"Number of input tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-output-len",
|
||||
type=int,
|
||||
default=150,
|
||||
help=
|
||||
"Number of output tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logprobs",
|
||||
type=int,
|
||||
@ -748,50 +951,26 @@ if __name__ == "__main__":
|
||||
"logprob is returned for each token; or (2) if beam search "
|
||||
"is enabled 1 logprob per token is computed"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-prefix-len",
|
||||
type=int,
|
||||
default=200,
|
||||
help=
|
||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-input-len",
|
||||
type=int,
|
||||
default=1024,
|
||||
help=
|
||||
"Number of input tokens per request, used only for random sampling.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-output-len",
|
||||
type=int,
|
||||
default=128,
|
||||
help=
|
||||
"Number of output tokens per request, used only for random sampling.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for random sampling.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-prefix-len",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of fixed prefix tokens before random "
|
||||
" context. The length range of context in a random "
|
||||
" request is [random-prefix-len, "
|
||||
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||
parser.add_argument(
|
||||
"--request-rate",
|
||||
type=float,
|
||||
default=float("inf"),
|
||||
help="Number of requests per second. If this is inf, "
|
||||
"then all the requests are sent at time 0. "
|
||||
"Otherwise, we use Poisson process to synthesize "
|
||||
"the request arrival times.",
|
||||
"Otherwise, we use Poisson process or gamma distribution "
|
||||
"to synthesize the request arrival times.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--burstiness",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Burstiness factor of the request generation. "
|
||||
"Only take effect when request_rate is not inf. "
|
||||
"Default value is 1, which follows Poisson process. "
|
||||
"Otherwise, the request intervals follow a gamma distribution. "
|
||||
"A lower burstiness value (0 < burstiness < 1) results in more "
|
||||
"bursty requests. A higher burstiness value (burstiness > 1) "
|
||||
"results in a more uniform arrival of requests.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument(
|
||||
@ -839,6 +1018,11 @@ if __name__ == "__main__":
|
||||
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||
" format.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignore-eos",
|
||||
action="store_true",
|
||||
help="Set ignore_eos flag when sending the benchmark request."
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
|
||||
parser.add_argument(
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
@ -856,6 +1040,97 @@ if __name__ == "__main__":
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||
|
||||
# group for dataset specific arguments
|
||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-input-len",
|
||||
type=int,
|
||||
default=550,
|
||||
help=
|
||||
"Number of input tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-output-len",
|
||||
type=int,
|
||||
default=150,
|
||||
help=
|
||||
"Number of output tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-prefix-len",
|
||||
type=int,
|
||||
default=200,
|
||||
help=
|
||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
|
||||
sharegpt_group = parser.add_argument_group("sharegpt dataset options")
|
||||
sharegpt_group.add_argument(
|
||||
"--sharegpt-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output length "
|
||||
"from the ShareGPT dataset.")
|
||||
|
||||
random_group = parser.add_argument_group("random dataset options")
|
||||
random_group.add_argument(
|
||||
"--random-input-len",
|
||||
type=int,
|
||||
default=1024,
|
||||
help=
|
||||
"Number of input tokens per request, used only for random sampling.",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-output-len",
|
||||
type=int,
|
||||
default=128,
|
||||
help=
|
||||
"Number of output tokens per request, used only for random sampling.",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for random sampling.",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-prefix-len",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of fixed prefix tokens before random "
|
||||
" context. The length range of context in a random "
|
||||
" request is [random-prefix-len, "
|
||||
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||
|
||||
hf_group = parser.add_argument_group("hf dataset options")
|
||||
hf_group.add_argument("--hf-subset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Subset of the HF dataset.")
|
||||
hf_group.add_argument("--hf-split",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Split of the HF dataset.")
|
||||
hf_group.add_argument(
|
||||
"--hf-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output lengths "
|
||||
"from the sampled HF dataset.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -1,29 +1,71 @@
|
||||
"""Benchmark offline inference throughput."""
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
import uvloop
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
|
||||
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
||||
|
||||
|
||||
def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int],
|
||||
) -> List[Tuple[str, int, int]]:
|
||||
@dataclasses.dataclass
|
||||
class SampleRequest:
|
||||
"""A class representing a single inference request for benchmarking.
|
||||
|
||||
Attributes:
|
||||
prompt: The input text prompt for the model.
|
||||
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
|
||||
images).
|
||||
prompt_len: The length of the prompt in tokens.
|
||||
expected_output_len: The expected length of the output in tokens.
|
||||
"""
|
||||
prompt: str
|
||||
prompt_len: int
|
||||
expected_output_len: int
|
||||
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||
|
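An illustrative instantiation of the dataclass above (prompt text and token counts are made up):

request = SampleRequest(
    prompt="Summarize the plot of Hamlet in one sentence.",
    prompt_len=12,              # token count of the prompt (hypothetical)
    expected_output_len=64,     # tokens the benchmark expects back
)                               # multi_modal_data defaults to None (text-only)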
||||
|
||||
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
|
||||
"""Prepend and append special tokens around the question to form a prompt.
|
||||
|
||||
Args:
|
||||
question: The input question text to wrap with special tokens
|
||||
model: The name of the model being used, to determine which special
|
||||
tokens to add
|
||||
|
||||
Returns:
|
||||
The formatted prompt string with appropriate special tokens for the
|
||||
model
|
||||
|
||||
Raises:
|
||||
ValueError: If an unsupported model name is provided
|
||||
"""
|
||||
model = model.lower()
|
||||
if "pixtral" in model:
|
||||
return f"<s>[INST]{question}\n[IMG][/INST]"
|
||||
raise ValueError(f"Unsupported model {model}")
|
||||
|
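For example, with a Pixtral checkpoint the helper wraps the question as follows; the model name is illustrative, and matching is a case-insensitive substring check on "pixtral":

prompt = _get_prompt_for_image_model(
    question="What is shown in this image?",
    model="mistralai/Pixtral-12B-2409",
)
assert prompt == "<s>[INST]What is shown in this image?\n[IMG][/INST]"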
||||
|
||||
def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
||||
args: argparse.Namespace) -> List[SampleRequest]:
|
||||
dataset_path: str = args.dataset
|
||||
num_requests: int = args.num_prompts
|
||||
fixed_output_len: Optional[int] = args.output_len
|
||||
model: str = args.model
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
||||
@ -32,23 +74,36 @@ def sample_requests(
|
||||
dataset = json.load(f)
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
# Only keep the first two turns of each conversation.
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
|
||||
# Filter out sequences that are too long or too short
|
||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
||||
for i in range(len(dataset)):
|
||||
filtered_dataset: List[SampleRequest] = []
|
||||
for data in dataset:
|
||||
if len(filtered_dataset) == num_requests:
|
||||
break
|
||||
|
||||
# Only keep the first two turns of each conversation.
|
||||
prompt = data["conversations"][0]["value"]
|
||||
completion = data["conversations"][1]["value"]
|
||||
|
||||
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||
if "image" in data:
|
||||
multi_modal_data = multi_modal_data or {}
|
||||
image_path = data["image"]
|
||||
# TODO(vllm-project/vllm/issues/9778): Support multiple images.
|
||||
assert isinstance(image_path,
|
||||
str), "Only support single image input"
|
||||
try:
|
||||
multi_modal_data["image"] = Image.open(image_path).convert(
|
||||
"RGB")
|
||||
except FileNotFoundError:
|
||||
# Ignore datapoint where asset is missing
|
||||
continue
|
||||
prompt = _get_prompt_for_image_model(question=prompt, model=model)
|
||||
|
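For reference, a ShareGPT-style datapoint that would exercise the image branch above could look like this; the field layout is inferred from the code, and the image path is made up:

data = {
    "conversations": [
        {"value": "What is in this photo?"},    # first turn -> prompt
        {"value": "A cat sitting on a sofa."},  # second turn -> completion
    ],
    "image": "/data/images/cat_0001.jpg",       # single image path (hypothetical)
}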
||||
# Tokenize the prompts and completions.
|
||||
prompt = dataset[i][0]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
@ -59,158 +114,88 @@ def sample_requests(
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
filtered_dataset.append((prompt, prompt_len, output_len))
|
||||
filtered_dataset.append(
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=multi_modal_data))
|
||||
|
||||
return filtered_dataset
|
||||
|
||||
|
||||
def run_vllm(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
model: str,
|
||||
tokenizer: str,
|
||||
quantization: Optional[str],
|
||||
tensor_parallel_size: int,
|
||||
seed: int,
|
||||
requests: List[SampleRequest],
|
||||
n: int,
|
||||
use_beam_search: bool,
|
||||
trust_remote_code: bool,
|
||||
dtype: str,
|
||||
max_model_len: Optional[int],
|
||||
enforce_eager: bool,
|
||||
kv_cache_dtype: str,
|
||||
quantization_param_path: Optional[str],
|
||||
device: str,
|
||||
enable_prefix_caching: bool,
|
||||
enable_chunked_prefill: bool,
|
||||
max_num_batched_tokens: int,
|
||||
distributed_executor_backend: Optional[str],
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
num_scheduler_steps: int = 1,
|
||||
use_v2_block_manager: bool = False,
|
||||
download_dir: Optional[str] = None,
|
||||
load_format: str = EngineArgs.load_format,
|
||||
disable_async_output_proc: bool = False,
|
||||
engine_args: EngineArgs,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
quantization=quantization,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
seed=seed,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
quantization_param_path=quantization_param_path,
|
||||
device=device,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
download_dir=download_dir,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
load_format=load_format,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
use_v2_block_manager=use_v2_block_manager,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
)
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
prompts: List[TextPrompt] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
for prompt, _, output_len in requests:
|
||||
prompts.append(prompt)
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=0.0 if use_beam_search else 1.0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
max_tokens=request.expected_output_len,
|
||||
))
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
use_beam_search = False
|
||||
|
||||
if not use_beam_search:
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
else:
|
||||
prompts = [request.prompt for request in requests]
|
||||
# output_len should be the same for all requests.
|
||||
output_len = requests[0][2]
|
||||
for request in requests:
|
||||
assert request.expected_output_len == output_len
|
||||
start = time.perf_counter()
|
||||
llm.beam_search(
|
||||
prompts,
|
||||
BeamSearchParams(
|
||||
beam_width=n,
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
))
|
||||
end = time.perf_counter()
|
||||
return end - start
|
||||
|
||||
|
||||
async def run_vllm_async(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
model: str,
|
||||
tokenizer: str,
|
||||
quantization: Optional[str],
|
||||
tensor_parallel_size: int,
|
||||
seed: int,
|
||||
requests: List[SampleRequest],
|
||||
n: int,
|
||||
use_beam_search: bool,
|
||||
trust_remote_code: bool,
|
||||
dtype: str,
|
||||
max_model_len: Optional[int],
|
||||
enforce_eager: bool,
|
||||
kv_cache_dtype: str,
|
||||
quantization_param_path: Optional[str],
|
||||
device: str,
|
||||
enable_prefix_caching: bool,
|
||||
enable_chunked_prefill: bool,
|
||||
max_num_batched_tokens: int,
|
||||
distributed_executor_backend: Optional[str],
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
num_scheduler_steps: int = 1,
|
||||
use_v2_block_manager: bool = False,
|
||||
download_dir: Optional[str] = None,
|
||||
load_format: str = EngineArgs.load_format,
|
||||
disable_async_output_proc: bool = False,
|
||||
engine_args: AsyncEngineArgs,
|
||||
disable_frontend_multiprocessing: bool = False,
|
||||
) -> float:
|
||||
from vllm import SamplingParams
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
quantization=quantization,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
seed=seed,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
quantization_param_path=quantization_param_path,
|
||||
device=device,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
download_dir=download_dir,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
load_format=load_format,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
use_v2_block_manager=use_v2_block_manager,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
worker_use_ray=False,
|
||||
engine_use_ray=False,
|
||||
disable_log_requests=True,
|
||||
)
|
||||
|
||||
async with build_async_engine_client_from_engine_args(
|
||||
engine_args, disable_frontend_multiprocessing) as llm:
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
prompts: List[TextPrompt] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
for prompt, _, output_len in requests:
|
||||
prompts.append(prompt)
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=0.0 if use_beam_search else 1.0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
max_tokens=request.expected_output_len,
|
||||
))
|
||||
|
||||
generators = []
|
||||
@ -226,15 +211,13 @@ async def run_vllm_async(
|
||||
|
||||
|
||||
def run_hf(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
requests: List[SampleRequest],
|
||||
model: str,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
n: int,
|
||||
use_beam_search: bool,
|
||||
max_batch_size: int,
|
||||
trust_remote_code: bool,
|
||||
) -> float:
|
||||
assert not use_beam_search
|
||||
llm = AutoModelForCausalLM.from_pretrained(
|
||||
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
|
||||
if llm.config.model_type == "llama":
|
||||
@ -266,7 +249,7 @@ def run_hf(
|
||||
padding=True).input_ids
|
||||
llm_outputs = llm.generate(
|
||||
input_ids=input_ids.cuda(),
|
||||
do_sample=not use_beam_search,
|
||||
do_sample=True,
|
||||
num_return_sequences=n,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
@ -286,14 +269,14 @@ def run_hf(
|
||||
|
||||
|
||||
def run_mii(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
requests: List[SampleRequest],
|
||||
model: str,
|
||||
tensor_parallel_size: int,
|
||||
output_len: int,
|
||||
) -> float:
|
||||
from mii import client, serve
|
||||
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
||||
prompts = [prompt for prompt, _, _ in requests]
|
||||
prompts = [request.prompt for request in requests]
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, max_new_tokens=output_len)
|
||||
@ -312,46 +295,60 @@ def main(args: argparse.Namespace):
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
if args.dataset is None:
|
||||
# Synthesize a prompt with the given input length.
|
||||
prompt = "hi" * (args.input_len - 1)
|
||||
requests = [(prompt, args.input_len, args.output_len)
|
||||
for _ in range(args.num_prompts)]
|
||||
else:
|
||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
||||
args.output_len)
|
||||
|
||||
if args.backend == "vllm":
|
||||
run_args = [
|
||||
requests, args.model, args.tokenizer, args.quantization,
|
||||
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
|
||||
args.trust_remote_code, args.dtype, args.max_model_len,
|
||||
args.enforce_eager, args.kv_cache_dtype,
|
||||
args.quantization_param_path, args.device,
|
||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
||||
args.max_num_batched_tokens, args.distributed_executor_backend,
|
||||
args.gpu_memory_utilization, args.num_scheduler_steps,
|
||||
args.use_v2_block_manager, args.download_dir, args.load_format,
|
||||
args.disable_async_output_proc
|
||||
]
|
||||
|
||||
if args.async_engine:
|
||||
run_args.append(args.disable_frontend_multiprocessing)
|
||||
elapsed_time = uvloop.run(run_vllm_async(*run_args))
|
||||
# As tokenizer may add additional tokens like BOS, we need to try
|
||||
# different lengths to get the desired input length.
|
||||
for i in range(-10, 10):
|
||||
prompt = "hi " * (args.input_len + i)
|
||||
tokenized_prompt = tokenizer(prompt).input_ids
|
||||
if len(tokenized_prompt) == args.input_len:
|
||||
break
|
||||
else:
|
||||
elapsed_time = run_vllm(*run_args)
|
||||
raise ValueError(
|
||||
f"Failed to synthesize a prompt with {args.input_len} tokens.")
|
||||
requests = [
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=args.input_len,
|
||||
expected_output_len=args.output_len)
|
||||
for _ in range(args.num_prompts)
|
||||
]
|
||||
else:
|
||||
requests = sample_requests(tokenizer, args)
|
||||
|
||||
is_multi_modal = any(request.multi_modal_data is not None
|
||||
for request in requests)
|
||||
if args.backend == "vllm":
|
||||
if args.async_engine:
|
||||
elapsed_time = uvloop.run(
|
||||
run_vllm_async(
|
||||
requests,
|
||||
args.n,
|
||||
AsyncEngineArgs.from_cli_args(args),
|
||||
args.disable_frontend_multiprocessing,
|
||||
))
|
||||
else:
|
||||
elapsed_time = run_vllm(requests, args.n,
|
||||
EngineArgs.from_cli_args(args))
|
||||
elif args.backend == "hf":
|
||||
assert args.tensor_parallel_size == 1
|
||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||
args.use_beam_search, args.hf_max_batch_size,
|
||||
args.trust_remote_code)
|
||||
args.hf_max_batch_size, args.trust_remote_code)
|
||||
elif args.backend == "mii":
|
||||
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
|
||||
args.output_len)
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {args.backend}")
|
||||
total_num_tokens = sum(prompt_len + output_len
|
||||
for _, prompt_len, output_len in requests)
|
||||
total_num_tokens = sum(request.prompt_len + request.expected_output_len
|
||||
for request in requests)
|
||||
total_output_tokens = sum(request.expected_output_len
|
||||
for request in requests)
|
||||
if is_multi_modal:
|
||||
print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
|
||||
"following metrics are not accurate because image tokens are not"
|
||||
" counted. See vllm-project/vllm/issues/9778 for details.")
|
||||
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
|
||||
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
|
||||
|
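With invented numbers, the two new throughput figures relate as follows:

elapsed_time = 10.0                              # seconds (illustrative)
total_num_tokens, total_output_tokens = 12_000, 2_000
print(f"{total_num_tokens / elapsed_time:.2f} total tokens/s")      # 1200.00
print(f"{total_output_tokens / elapsed_time:.2f} output tokens/s")  # 200.00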
||||
# Output JSON results if specified
|
||||
if args.output_json:
|
||||
@ -375,7 +372,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
help="Path to the dataset. The dataset is expected to "
|
||||
"be a json in form of List[Dict[..., conversations: "
|
||||
"List[Dict[..., value: <prompt_or_response>]]]]")
|
||||
parser.add_argument("--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
@ -385,142 +384,23 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
||||
parser.add_argument("--tokenizer", type=str, default=None)
|
||||
parser.add_argument('--quantization',
|
||||
'-q',
|
||||
choices=[*QUANTIZATION_METHODS, None],
|
||||
default=None)
|
||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
||||
parser.add_argument("--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--use-beam-search", action="store_true")
|
||||
parser.add_argument("--num-prompts",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--hf-max-batch-size",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum batch size for HF backend.")
|
||||
parser.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
help='trust remote code from huggingface')
|
||||
parser.add_argument(
|
||||
'--max-model-len',
|
||||
type=int,
|
||||
default=None,
|
||||
help='Maximum length of a sequence (including prompt and output). '
|
||||
'If None, will be derived from the model.')
|
||||
parser.add_argument(
|
||||
'--dtype',
|
||||
type=str,
|
||||
default='auto',
|
||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
||||
help='data type for model weights and activations. '
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--gpu-memory-utilization',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument("--enforce-eager",
|
||||
action="store_true",
|
||||
help="enforce eager execution")
|
||||
parser.add_argument(
|
||||
'--kv-cache-dtype',
|
||||
type=str,
|
||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
||||
default="auto",
|
||||
help='Data type for kv cache storage. If "auto", will use model '
|
||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||
'instead supported for common inference criteria.')
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=DEVICE_OPTIONS,
|
||||
help='device type for vLLM execution')
|
||||
parser.add_argument(
|
||||
"--num-scheduler-steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Maximum number of forward steps per scheduler call.")
|
||||
parser.add_argument("--use-v2-block-manager",
|
||||
action='store_true',
|
||||
help="Enable block manager v2.")
|
||||
parser.add_argument(
|
||||
"--enable-prefix-caching",
|
||||
action='store_true',
|
||||
help="Enable automatic prefix caching for vLLM backend.")
|
||||
parser.add_argument("--enable-chunked-prefill",
|
||||
action='store_true',
|
||||
help="enable chunked prefill for vLLM backend.")
|
||||
parser.add_argument('--max-num-batched-tokens',
|
||||
type=int,
|
||||
default=None,
|
||||
help='maximum number of batched tokens per '
|
||||
'iteration')
|
||||
parser.add_argument('--download-dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
parser.add_argument(
|
||||
'--distributed-executor-backend',
|
||||
choices=['ray', 'mp'],
|
||||
default=None,
|
||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||
'is used, will be automatically set to "ray" if installed '
|
||||
'or "mp" (multiprocessing) otherwise.')
|
||||
parser.add_argument(
|
||||
'--load-format',
|
||||
type=str,
|
||||
default=EngineArgs.load_format,
|
||||
choices=[
|
||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||
'bitsandbytes'
|
||||
],
|
||||
help='The format of the model weights to load.\n\n'
|
||||
'* "auto" will try to load the weights in the safetensors format '
|
||||
'and fall back to the pytorch bin format if safetensors format '
|
||||
'is not available.\n'
|
||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||
'* "npcache" will load the weights in pytorch format and store '
|
||||
'a numpy cache to speed up the loading.\n'
|
||||
'* "dummy" will initialize the weights with random values, '
|
||||
'which is mainly for profiling.\n'
|
||||
'* "tensorizer" will load the weights using tensorizer from '
|
||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||
'section for more information.\n'
|
||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||
'quantization.\n')
|
||||
parser.add_argument(
|
||||
"--disable-async-output-proc",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Disable async output processor for vLLM backend.")
|
||||
parser.add_argument("--async-engine",
|
||||
action='store_true',
|
||||
default=False,
|
||||
@ -529,6 +409,7 @@ if __name__ == "__main__":
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Disable decoupled async engine frontend.")
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
@ -551,8 +432,6 @@ if __name__ == "__main__":
|
||||
raise ValueError("dtype must be auto for MII backend.")
|
||||
if args.n != 1:
|
||||
raise ValueError("n must be 1 for MII backend.")
|
||||
if args.use_beam_search:
|
||||
raise ValueError("Beam search is not supported for MII backend.")
|
||||
if args.quantization is not None:
|
||||
raise ValueError("Quantization is only for vLLM backend.")
|
||||
if args.hf_max_batch_size is not None:
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -16,10 +16,7 @@ def main(num_tokens: int,
|
||||
do_profile: bool = False,
|
||||
num_warmup_iters: int = 5,
|
||||
num_iters: int = 100) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
||||
|
||||
@ -4,8 +4,10 @@ import itertools
|
||||
import math
|
||||
import pickle as pkl
|
||||
import time
|
||||
from typing import Callable, Iterable, List, Tuple
|
||||
from itertools import product
|
||||
from typing import Callable, Iterable, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
@ -84,6 +86,10 @@ def loop_over_weights(
|
||||
fn(a, w_ref, w_q, w_s)
|
||||
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
|
||||
|
||||
|
||||
def bench(atype: torch.dtype,
|
||||
wtype: ScalarType,
|
||||
group_size: int,
|
||||
@ -94,6 +100,8 @@ def bench(atype: torch.dtype,
|
||||
sub_label: str,
|
||||
benchmark_marlinv1: bool = True,
|
||||
sweep_schedules: bool = True) -> Iterable[TMeasurement]:
|
||||
global _SWEEP_SCHEDULES_RESULTS
|
||||
|
||||
a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
|
||||
sub_label += f", L={len(weights)}"
|
||||
|
||||
@ -163,6 +171,11 @@ def bench(atype: torch.dtype,
|
||||
best_schedule = None
|
||||
schedules = ops.machete_supported_schedules(wtype)
|
||||
for schedule in reversed(schedules):
|
||||
schedule_M = int(schedule.split("_")[0].split("x")[1])
|
||||
|
||||
# Prune known bad schedules
|
||||
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
|
||||
continue
|
||||
|
||||
def run(a, _, w_q, w_s, schedule=schedule):
|
||||
ops.machete_gemm(a,
|
||||
@ -175,6 +188,20 @@ def bench(atype: torch.dtype,
|
||||
res = bench_fn(label, sub_label, "machete_best",
|
||||
lambda: loop_over_weights(a, weights_machete, run))
|
||||
|
||||
results_row = {
|
||||
"M": m,
|
||||
"K": k,
|
||||
"N": n,
|
||||
"group_size": group_size,
|
||||
"schedule": schedule,
|
||||
"median": res.median,
|
||||
}
|
||||
if _SWEEP_SCHEDULES_RESULTS is None:
|
||||
_SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
|
||||
columns=results_row.keys())
|
||||
_SWEEP_SCHEDULES_RESULTS.\
|
||||
loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
|
||||
|
||||
print(f" {res.median:5.5} ", schedule)
|
||||
if not best or res.median < best.median:
|
||||
best = res
|
||||
@ -235,18 +262,22 @@ def run_square_bench(args):
|
||||
dim_sizes = list(
|
||||
range(args.dim_start, args.dim_end + 1, args.dim_increment))
|
||||
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
|
||||
|
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"square_bench-{args.dtype}")
|
||||
|
||||
|
||||
def run_range_bench(args):
|
||||
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
|
||||
n = len(dim_sizes)
|
||||
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
|
||||
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
|
||||
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
|
||||
MKNs = list(zip(Ms, Ks, Ns))
|
||||
m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
|
||||
m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
|
||||
m_increment, k_increment, n_increment = \
|
||||
(int(x) for x in args.dim_increment.split(","))
|
||||
Ms = list(range(m_start, m_end + 1, m_increment))
|
||||
Ks = list(range(k_start, k_end + 1, k_increment))
|
||||
Ns = list(range(n_start, n_end + 1, n_increment))
|
||||
MKNs = list(product(Ms, Ks, Ns))
|
||||
|
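With the new comma-separated flags, a hypothetical invocation such as --dim-start 128,128,128 --dim-end 256,256,256 --dim-increment 128,128,128 expands into the full Cartesian grid of (M, K, N) shapes:

from itertools import product

Ms = Ks = Ns = [128, 256]            # each built with range(start, end + 1, increment)
MKNs = list(product(Ms, Ks, Ns))     # 2 * 2 * 2 = 8 benchmark shapes
assert (128, 256, 128) in MKNs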
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"range_bench-{args.dtype}")
|
||||
@ -333,6 +364,9 @@ Benchmark Machete GEMM.
|
||||
action="store_true",
|
||||
help="Run a sweep over all supported schedules",
|
||||
)
|
||||
parser.add_argument("--sweep-csv-out",
|
||||
help="CSV to store sweep results",
|
||||
default="sch_sweep_results.csv")
|
||||
subparsers = parser.add_subparsers(dest="cmd", required=True)
|
||||
|
||||
square_parser = subparsers.add_parser("square_bench")
|
||||
@ -342,12 +376,21 @@ Benchmark Machete GEMM.
|
||||
square_parser.set_defaults(func=run_square_bench)
|
||||
|
||||
range_parser = subparsers.add_parser("range_bench")
|
||||
range_parser.add_argument("--dim-start", type=int, required=True)
|
||||
range_parser.add_argument("--dim-end", type=int, required=True)
|
||||
range_parser.add_argument("--dim-increment", type=int, required=True)
|
||||
range_parser.add_argument("--m-constant", type=int, default=None)
|
||||
range_parser.add_argument("--n-constant", type=int, default=None)
|
||||
range_parser.add_argument("--k-constant", type=int, default=None)
|
||||
range_parser.add_argument(
|
||||
"--dim-start",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Start value for M,K,N as common separated list")
|
||||
range_parser.add_argument(
|
||||
"--dim-end",
|
||||
type=str,
|
||||
required=True,
|
||||
help="End value (inclusive) for M,K,N as common separated list")
|
||||
range_parser.add_argument(
|
||||
"--dim-increment",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Increment value for M,K,N as common separated list")
|
||||
range_parser.set_defaults(func=run_range_bench)
|
||||
|
||||
model_parser = subparsers.add_parser("model_bench")
|
||||
@ -369,4 +412,9 @@ Benchmark Machete GEMM.
|
||||
model_parser.set_defaults(func=run_model_bench)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
|
||||
args.func(args)
|
||||
|
||||
if _SWEEP_SCHEDULES_RESULTS is not None:
|
||||
_SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
|
||||
|
||||
@ -10,6 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -88,22 +89,23 @@ def benchmark_config(
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
override_config=config,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe import override_config
|
||||
with override_config(config):
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@ -166,7 +168,7 @@ class BenchmarkWorker:
|
||||
|
||||
def __init__(self, seed: int) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
self.seed = seed
|
||||
|
||||
def benchmark(
|
||||
@ -180,7 +182,7 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
) -> Tuple[Dict[str, int], float]:
|
||||
torch.cuda.manual_seed_all(self.seed)
|
||||
current_platform.seed_everything(self.seed)
|
||||
dtype_str = get_config_dtype_str(dtype,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
use_fp8_w8a8=use_fp8_w8a8)
|
||||
|
||||
@ -5,6 +5,7 @@ from typing import List, Optional
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||
create_kv_caches_with_random)
|
||||
|
||||
@ -28,10 +29,7 @@ def main(
|
||||
device: str = "cuda",
|
||||
kv_cache_dtype: Optional[str] = None,
|
||||
) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
query = torch.empty(num_seqs,
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -17,10 +17,7 @@ def main(num_tokens: int,
|
||||
do_profile: bool = False,
|
||||
num_warmup_iters: int = 5,
|
||||
num_iters: int = 100) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
|
||||
@ -6,6 +6,7 @@ import torch

from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                          get_rope)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser

@ -22,9 +23,7 @@ def benchmark_rope_kernels_multi_lora(
    max_position: int = 8192,
    base: int = 10000,
) -> None:
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size

@ -33,7 +32,7 @@ def benchmark_rope_kernels_multi_lora(
    # batched RoPE can take multiple scaling factors
    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
                            is_neox_style, {
                                "type": "linear",
                                "rope_type": "linear",
                                "factor": tuple(scaling_factors)
                            })
    # non-batched RoPE takes only one scaling factor, we create multiple

@ -43,7 +42,7 @@ def benchmark_rope_kernels_multi_lora(
        non_batched_ropes.append(
            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                     {
                         "type": "linear",
                         "rope_type": "linear",
                         "factor": (scaling_factor, )
                     }))
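These RoPE hunks rename the scaling-config key from "type" to "rope_type" in the dict passed as the sixth argument of get_rope. A hedged example of a call using the new key; the numeric values here are illustrative only, since the benchmark derives them from its CLI arguments:

    from vllm.model_executor.layers.rotary_embedding import get_rope

    # head_size, rotary_dim, max_position, base, is_neox_style, scaling dict
    rope = get_rope(128, 128, 8192, 10000, True, {
        "rope_type": "linear",   # key renamed from "type"
        "factor": (4.0, ),
    })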
@ -45,8 +45,7 @@ if __name__ == "__main__":
    rows = int(math.ceil(len(results) / 2))
    fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
    axs = axs.flatten()
    axs_idx = 0
    for shape, data in results.items():
    for axs_idx, (shape, data) in enumerate(results.items()):
        plt.sca(axs[axs_idx])
        df = pd.DataFrame(data)
        sns.lineplot(data=df,

@ -59,6 +58,5 @@ if __name__ == "__main__":
                     palette="Dark2")
        plt.title(f"Shape: {shape}")
        plt.ylabel("time (median, s)")
        axs_idx += 1
    plt.tight_layout()
    plt.savefig("graph_machete_bench.pdf")
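The plotting change swaps manual axs_idx bookkeeping for enumerate, which yields the subplot index alongside each (shape, data) pair. A standalone sketch of the same idiom with made-up data:

    results = {"shape_a": [1.0, 2.0], "shape_b": [3.0, 4.0]}  # illustrative

    # enumerate() provides the running index, so no separate counter needs to
    # be initialized before the loop and incremented inside it.
    for axs_idx, (shape, data) in enumerate(results.items()):
        print(axs_idx, shape, data)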
benchmarks/kernels/requirements.txt (new file, 1 line)
@ -0,0 +1 @@
pandas
@ -4,13 +4,13 @@ PORT=8000
MODEL=$1
TOKENS=$2

docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
        -v $PWD/data:/data \
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
        -v "$PWD/data:/data" \
        ghcr.io/huggingface/text-generation-inference:2.2.0 \
        --model-id $MODEL \
        --model-id "$MODEL" \
        --sharded false \
        --max-input-length 1024 \
        --max-total-tokens 2048 \
        --max-best-of 5 \
        --max-concurrent-requests 5000 \
        --max-batch-total-tokens $TOKENS
        --max-batch-total-tokens "$TOKENS"
@ -16,7 +16,6 @@ def main(args):
        enforce_eager=True,
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
        use_v2_block_manager=args.use_v2_block_manager,
    )

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

@ -56,8 +55,5 @@ if __name__ == "__main__":
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceMangerV2')
    args = parser.parse_args()
    main(args)
@ -1,5 +1,8 @@
include(FetchContent)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 17)

#
# Define environment variables for special configurations

@ -15,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
list(APPEND CXX_COMPILE_FLAGS
    "-fopenmp"
    "-mf16c"
    "-DVLLM_CPU_EXTENSION")

execute_process(COMMAND cat /proc/cpuinfo

@ -82,9 +86,39 @@ else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif()

#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
#
if (AVX512_FOUND AND NOT AVX512_DISABLED)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
        GIT_TAG v3.6
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )

    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
    set(ONEDNN_BUILD_EXAMPLES "OFF")
    set(ONEDNN_BUILD_TESTS "OFF")
    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
    set(ONEDNN_BUILD_GRAPH "OFF")
    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

    FetchContent_MakeAvailable(oneDNN)

    list(APPEND LIBS dnnl)
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

list(APPEND LIBS dnnl numa)
list(APPEND LIBS numa)

#
# _C extension

@ -120,4 +154,3 @@ define_gpu_extension_target(
)

message(STATUS "Enabling C extension.")
add_dependencies(default _C)
@ -133,10 +133,181 @@ macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
endmacro()

#
# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
# `CUDA_ARCH_FLAGS`.
#
# Example:
#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
#   clear_cuda_arches(CUDA_ARCH_FLAGS)
#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
#   CMAKE_CUDA_FLAGS="-Wall"
#
macro(clear_cuda_arches CUDA_ARCH_FLAGS)
  # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
  string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
    ${CMAKE_CUDA_FLAGS})

  # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
  # and passed back via the `CUDA_ARCHITECTURES` property.
  string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
    ${CMAKE_CUDA_FLAGS})
endmacro()
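clear_cuda_arches amounts to two regex passes over one flags string: collect every "-gencode arch=..." token, then strip those tokens from what remains. A rough Python equivalent of the same two steps, for illustration only:

    import re

    # Example input in the shape torch writes into CMAKE_CUDA_FLAGS.
    cmake_cuda_flags = ("-Wall -gencode arch=compute_70,code=sm_70 "
                        "-gencode arch=compute_75,code=sm_75")

    # MATCHALL: collect every "-gencode arch=..." token.
    cuda_arch_flags = re.findall(r"-gencode arch=[^ ]+", cmake_cuda_flags)

    # REPLACE: remove those tokens from the remaining compiler flags.
    cmake_cuda_flags = re.sub(r"-gencode arch=[^ ]+ *", "", cmake_cuda_flags).strip()

    print(cuda_arch_flags)   # ['-gencode arch=compute_70,code=sm_70', '-gencode arch=compute_75,code=sm_75']
    print(cmake_cuda_flags)  # '-Wall'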
#
# Extract unique CUDA architectures from a list of compute capability codes in
# the form `<major><minor>[<letter>]`, convert them to the form
# `<major>.<minor>`, dedupe them, sort them in ascending order, and store them
# in `OUT_ARCHES`.
#
# Example:
#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
#   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
#   OUT_ARCHES="7.5;...;9.0"
function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
  set(_CUDA_ARCHES)
  foreach(_ARCH ${CUDA_ARCH_FLAGS})
    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
    if (_COMPUTE)
      set(_COMPUTE ${CMAKE_MATCH_1})
    endif()

    string_to_ver(_COMPUTE_VER ${_COMPUTE})
    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHES)
  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
endfunction()
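The same extraction can be restated in a few lines of Python, which makes the "compute_90a" to "9.0a" normalization easier to see. This is a sketch, not the build system's actual code:

    import re

    def to_ver(code: str) -> str:
        # "90a" -> "9.0a", "75" -> "7.5" (rough analogue of string_to_ver).
        m = re.match(r"([0-9]+)([0-9])(a?)$", code)
        return f"{m.group(1)}.{m.group(2)}{m.group(3)}"

    def extract_unique_cuda_archs_ascending(cuda_arch_flags):
        arches = []
        for flag in cuda_arch_flags:
            m = re.search(r"arch=compute_([0-9]+a?)", flag)
            if m:
                ver = to_ver(m.group(1))
                if ver not in arches:          # dedupe
                    arches.append(ver)
        # natural ascending sort on the numeric part
        return sorted(arches,
                      key=lambda v: tuple(int(x) for x in v.rstrip("a").split(".")))

    print(extract_unique_cuda_archs_ascending(
        ["-gencode arch=compute_90a,code=sm_90a",
         "-gencode arch=compute_75,code=sm_75"]))  # ['7.5', '9.0a']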
#
# For a specific file set the `-gencode` flag in compile options conditionally
# for the CUDA language.
#
# Example:
#   set_gencode_flag_for_srcs(
#     SRCS "foo.cu"
#     ARCH "compute_75"
#     CODE "sm_75")
#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
#   `foo.cu` (only for the CUDA language).
#
macro(set_gencode_flag_for_srcs)
  set(options)
  set(oneValueArgs ARCH CODE)
  set(multiValueArgs SRCS)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN} )
  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
  set_property(
    SOURCE ${arg_SRCS}
    APPEND PROPERTY
    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
  )

  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
endmacro(set_gencode_flag_for_srcs)

#
# For a list of source files set the `-gencode` flags in the files' specific
# compile options (specifically for the CUDA language).
#
# Arguments are:
#   SRCS: list of source files
#   CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
#   BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
#     for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
#     that is larger than BUILD_PTX_FOR_ARCH.
#
macro(set_gencode_flags_for_srcs)
  set(options)
  set(oneValueArgs BUILD_PTX_FOR_ARCH)
  set(multiValueArgs SRCS CUDA_ARCHS)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN} )

  foreach(_ARCH ${arg_CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    set_gencode_flag_for_srcs(
      SRCS ${arg_SRCS}
      ARCH "compute_${_ARCH}"
      CODE "sm_${_ARCH}")
  endforeach()

  if (${arg_BUILD_PTX_FOR_ARCH})
    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
      set_gencode_flag_for_srcs(
        SRCS ${arg_SRCS}
        ARCH "compute_${_PTX_ARCH}"
        CODE "compute_${_PTX_ARCH}")
    endif()
  endif()
endmacro()
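set_gencode_flags_for_srcs therefore emits one real-code gencode entry per listed architecture and, optionally, one PTX entry for a baseline arch so newer, unlisted GPUs can still JIT the kernels. A small Python sketch of the flag list it effectively produces; the function and variable names here are illustrative:

    def gencode_flags(cuda_archs, build_ptx_for_arch=None):
        as_tuple = lambda v: tuple(int(x) for x in v.rstrip("a").split("."))
        flags = []
        # One "arch=compute_XY,code=sm_XY" (real code) entry per listed arch.
        for arch in cuda_archs:
            xy = arch.replace(".", "")
            flags.append(f"-gencode arch=compute_{xy},code=sm_{xy}")
        # Optionally add PTX for a baseline arch, but only when some listed
        # arch is at least that new (mirrors the BUILD_PTX_FOR_ARCH check).
        if build_ptx_for_arch is not None:
            highest = max(cuda_archs, key=as_tuple)
            if as_tuple(highest) >= as_tuple(build_ptx_for_arch):
                xy = build_ptx_for_arch.replace(".", "")
                flags.append(f"-gencode arch=compute_{xy},code=compute_{xy}")
        return flags

    print(gencode_flags(["7.5", "8.0", "9.0a"], build_ptx_for_arch="8.0"))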
#
# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
# `<major>.<minor>[letter]` compute the "loose intersection" with the
# `TGT_CUDA_ARCHS` list of gencodes.
# The loose intersection is defined as:
#   { max{ x \in src | x <= y } | y \in tgt, { x \in src | x <= y } != {} }
# where `<=` is the version comparison operator.
# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
# in `SRC_CUDA_ARCHS` that is less than or equal to it.
# There is special handling for 9.0a: if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
# in `TGT_CUDA_ARCHS`, then 9.0a is removed from `SRC_CUDA_ARCHS` and added
# directly to the result.
# The result is stored in `OUT_CUDA_ARCHS`.
#
# Example:
#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)

  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
  set(_CUDA_ARCHS)
  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
      set(_CUDA_ARCHS "9.0a")
    endif()
  endif()

  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is
  # less than or equal to ARCH
  foreach(_ARCH ${CUDA_ARCHS})
    set(_TMP_ARCH)
    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
        set(_TMP_ARCH ${_SRC_ARCH})
      else()
        break()
      endif()
    endforeach()
    if (_TMP_ARCH)
      list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHS)
  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()
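Restated in Python, the loose intersection walks the target list and, for each target version, keeps the highest source version that does not exceed it, with the 9.0a special case bolted on. A sketch using the example from the comment above:

    def loose_intersection(src_archs, tgt_archs):
        as_tuple = lambda v: tuple(int(x) for x in v.rstrip("a").split("."))

        # Special case: keep 9.0a only when the target list asks for 9.0.
        src = sorted((a for a in src_archs if a != "9.0a"), key=as_tuple)
        out = {"9.0a"} if ("9.0a" in src_archs and "9.0" in tgt_archs) else set()

        # For each target arch, take the highest source arch that is <= it.
        for tgt in tgt_archs:
            candidates = [s for s in src if as_tuple(s) <= as_tuple(tgt)]
            if candidates:
                out.add(candidates[-1])
        return sorted(out, key=lambda v: (as_tuple(v), v))

    print(loose_intersection(["7.5", "8.0", "8.6", "9.0", "9.0a"],
                             ["8.0", "8.9", "9.0"]))
    # ['8.0', '8.6', '9.0', '9.0a'] -- matches the documented example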
#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`.
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# the architectures on a per file basis.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
#

@ -174,109 +345,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()

  elseif(${GPU_LANG} STREQUAL "CUDA")
    #
    # Setup/process CUDA arch flags.
    #
    # The torch cmake setup hardcodes the detected architecture flags in
    # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
    # can't be modified on a per-target basis.
    # So, all the `-gencode` flags need to be extracted and removed from
    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
    # Since it's not possible to use `target_compile_options` for adding target
    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
    # must be used instead. This requires repackaging the architecture flags
    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
    #
    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
    # to one of the other nvcc options to specify architectures.
    #
    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
    # detected architectures.
    #
    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
    # and passed back via the `CUDA_ARCHITECTURES` property.
    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # If this error is triggered, it might mean that torch has changed how it sets
    # up nvcc architecture code generation flags.
    if (NOT _CUDA_ARCH_FLAGS)
      message(FATAL_ERROR
        "Could not find any architecture related code generation flags in "
        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
    endif()

    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")

    # Initialize the architecture lists to empty.
    set(${GPU_ARCHES})

    # Process each `gencode` flag.
    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
      # For each flag, extract the version number and whether it refers to PTX
      # or native code.
      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
      # for that match.

      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
      if (_COMPUTE)
        set(_COMPUTE ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
      if (_SM)
        set(_SM ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
      if (_CODE)
        set(_CODE ${CMAKE_MATCH_1})
      endif()

      # Make sure the virtual architecture can be matched.
      if (NOT _COMPUTE)
        message(FATAL_ERROR
          "Could not determine virtual architecture from: ${_ARCH}.")
      endif()

      # One of sm_ or compute_ must exist.
      if ((NOT _SM) AND (NOT _CODE))
        message(FATAL_ERROR
          "Could not determine a codegen architecture from: ${_ARCH}.")
      endif()

      if (_SM)
        # The -real suffix tells CMake to generate only ELF code for the kernels.
        # We want this, otherwise the added PTX (the default) would increase binary size.
        set(_VIRT "-real")
        set(_CODE_ARCH ${_SM})
      else()
        # The -virtual suffix tells CMake to generate PTX code for the kernels.
        set(_VIRT "-virtual")
        set(_CODE_ARCH ${_CODE})
      endif()

      # Check if the current version is in the supported arch list.
      string_to_ver(_CODE_VER ${_CODE_ARCH})
      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
        continue()
      endif()

      # Add it to the arch list.
      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
    endforeach()
  endif()
  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()

#
@ -350,18 +419,15 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})

  # TODO: is torch_python_LIBRARY needed?
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
      ${CUDA_LIBRARIES})
    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
endfunction()
@ -1,17 +1,19 @@
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

# Unlike the rest of the PyTorch this file must be python2 compliant.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
import datetime
import locale
import os
import re
import subprocess
import sys
# Unlike the rest of the PyTorch this file must be python2 compliant.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple

from vllm.envs import environment_variables

try:
    import torch
    TORCH_AVAILABLE = True

@ -52,6 +54,7 @@ SystemEnv = namedtuple(
    'vllm_version',  # vllm specific field
    'vllm_build_flags',  # vllm specific field
    'gpu_topo',  # vllm specific field
    'env_vars',
])

DEFAULT_CONDA_PATTERNS = {
@ -267,13 +270,16 @@ def get_neuron_sdk_version(run_lambda):


def get_vllm_version():
    try:
        import vllm
        return vllm.__version__ + "@" + vllm.__commit__
    except Exception:
        # old version of vllm does not have __commit__
        return 'N/A'
    from vllm import __version__, __version_tuple__

    if __version__ == "dev":
        return "N/A (dev)"

    if len(__version_tuple__) == 4:  # dev build
        git_sha = __version_tuple__[-1][1:]  # type: ignore
        return f"{__version__} (git sha: {git_sha}"

    return __version__
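The rewritten get_vllm_version reads a setuptools-scm style __version_tuple__ instead of the old vllm.__commit__ attribute; for dev builds the tuple's last element appears to carry the abbreviated git sha with a one-character prefix. A hedged illustration of just the parsing step, with made-up version metadata:

    # Hypothetical values; the real ones come from vllm's generated version module.
    __version__ = "0.6.4.dev123"
    __version_tuple__ = (0, 6, 4, "gabc1234")

    if len(__version_tuple__) == 4:              # dev builds carry a trailing sha field
        git_sha = __version_tuple__[-1][1:]      # drop the leading "g"
        print(f"{__version__} (git sha: {git_sha})")  # 0.6.4.dev123 (git sha: abc1234)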
def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if
    # you need to check environment variables, etc.

@ -285,9 +291,14 @@ def summarize_vllm_build_flags():


def get_gpu_topo(run_lambda):
    output = None

    if get_platform() == 'linux':
        return run_and_read_all(run_lambda, 'nvidia-smi topo -m')
    return None
        output = run_and_read_all(run_lambda, 'nvidia-smi topo -m')
        if output is None:
            output = run_and_read_all(run_lambda, 'rocm-smi --showtopo')

    return output


# example outputs of CPU infos
@ -504,6 +515,22 @@ def is_xnnpack_available():
    else:
        return "N/A"


def get_env_vars():
    env_vars = ''
    secret_terms = ('secret', 'token', 'api', 'access', 'password')
    report_prefix = ("TORCH", "NCCL", "PYTORCH",
                     "CUDA", "CUBLAS", "CUDNN",
                     "OMP_", "MKL_",
                     "NVIDIA")
    for k, v in os.environ.items():
        if any(term in k.lower() for term in secret_terms):
            continue
        if k in environment_variables:
            env_vars = env_vars + "{}={}".format(k, v) + "\n"
        if k.startswith(report_prefix):
            env_vars = env_vars + "{}={}".format(k, v) + "\n"

    return env_vars


def get_env_info():
    run_lambda = run

@ -575,6 +602,7 @@ def get_env_info():
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
        env_vars=get_env_vars(),
    )


@ -623,6 +651,8 @@ vLLM Build Flags:
{vllm_build_flags}
GPU Topology:
{gpu_topo}

{env_vars}
""".strip()
Some files were not shown because too many files have changed in this diff.