Compare commits


1 Commit

Author SHA1 Message Date
617fb893d5 add compile 2024-07-26 19:29:36 -07:00
1020 changed files with 29094 additions and 121057 deletions

View File

@@ -1,43 +1,36 @@
import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
MAX_SIZE_MB = 250
def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
def check_wheel_size(directory):
"""Check the size of .whl files in the given directory."""
for root, _, files in os.walk(directory):
for file_name in files:
if file_name.endswith(".whl"):
wheel_path = os.path.join(root, file_name)
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_size_mb > VLLM_MAX_SIZE_MB:
print(f"Not allowed: Wheel {wheel_path} is larger "
f"({wheel_size_mb:.2f} MB) than the limit "
f"({VLLM_MAX_SIZE_MB} MB).")
for f in files:
if f.endswith(".whl"):
wheel_path = os.path.join(root, f)
wheel_size = os.path.getsize(wheel_path)
wheel_size_mb = wheel_size / (1024 * 1024)
if wheel_size_mb > MAX_SIZE_MB:
print(
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb:.2f} MB).")
f"({wheel_size_mb} MB).")
return 0
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python check-wheel-size.py <directory>")
sys.exit(1)
directory = sys.argv[1]
sys.exit(check_wheel_size(directory))
import sys
sys.exit(check_wheel_size(sys.argv[1]))

View File

@@ -9,4 +9,3 @@ tasks:
value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True

View File

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.764
- name: "exact_match,flexible-extract"
value: 0.764
limit: 250
num_fewshot: 5

View File

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@@ -1,11 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.233
value: 0.252
- name: "exact_match,flexible-extract"
value: 0.236
value: 0.252
limit: 1000
num_fewshot: 5

View File

@@ -1,10 +1,9 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Minitron-4B-Base.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
usage() {
echo

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install lm-eval==0.4.3
usage() {
echo

View File

@@ -14,7 +14,7 @@ import lm_eval
import numpy
import yaml
RTOL = 0.05
RTOL = 0.02
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -23,12 +23,9 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true," \
f"trust_remote_code={trust_remote_code}"
f"add_bos_token=true"
results = lm_eval.simple_evaluate(
model="vllm",
@@ -49,15 +46,10 @@ def test_lm_eval_correctness():
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
success = success and numpy.isclose(
ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)

View File

@@ -34,18 +34,17 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
The performance benchmark will be triggered when:
- A PR is merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
- Every commit for those PRs with `perf-benchmarks` label.
The nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
- Every commit for those PRs with `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
@@ -69,7 +68,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command-line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py` (see the sketch below). For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command-line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py` (see the sketch below). For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
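As an editorial illustration (not part of this commit), here is a minimal Python sketch of the underscore-to-dash conversion described above; the suite scripts implement it in bash with `jq`, and the example dictionary simply mirrors the latency test arguments shown above:
```
def params_to_cli_args(parameters: dict) -> str:
    """Turn a JSON-style parameters object into CLI arguments,
    replacing '_' with '-' in each key, as the suite scripts do."""
    return " ".join(
        f"--{key.replace('_', '-')} {value}"
        for key, value in parameters.items())


params = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print(params_to_cli_args(params))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 \
# --load-format dummy --num-iters-warmup 5 --num-iters 15
```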

View File

@@ -8,7 +8,8 @@ steps:
containers:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
agents:
@@ -20,7 +21,7 @@ steps:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
resources:
limits:
nvidia.com/gpu: 8
@@ -41,20 +42,20 @@ steps:
- name: devshm
emptyDir:
medium: Memory
# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN
- label: "H100"
agents:
queue: H100
plugins:
- docker#v5.11.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

View File

@@ -1,28 +0,0 @@
## Description
This file contains the download links for the benchmarking results.
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
Please download the visualization scripts in the post.
## Results reproduction
- Find the docker image we use in the `benchmarking pipeline`
- Deploy the docker image, and inside the container:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
The results will be inside `./benchmarks/results`.

View File

@@ -1,39 +1,45 @@
# Nightly benchmark
This benchmark aims to:
- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance for which workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions.
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
The main goal of this benchmark is two-fold:
- Performance clarity: provide clarity on which engine (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance for which workload.
- Reproducibility: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions in [reproduce.md]().
## Setup
## Docker images
- Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we use r24.07 because the current implementation only works with this version. We plan to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
- Workload:
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
- Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (queries per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed (see the sketch below).
- Evaluation metrics: Throughput (the higher the better), TTFT (time to the first token, the lower the better), ITL (inter-token latency, the lower the better).
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
# Known issues
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
## Hardware
One AWS node with 8x NVIDIA A100 GPUs.
## Workload description
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
- Input length: randomly sample 500 prompts from the ShareGPT dataset (with a fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is determined using a Poisson process (with a fixed random seed; see the sketch below).
- Evaluation metrics: Throughput (the higher the better), TTFT (time to the first token, the lower the better), ITL (inter-token latency, the lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
## Plots
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. A value of 0 means that the corresponding benchmark crashed.
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}

View File

@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
resources:
limits:
nvidia.com/gpu: 8
@@ -37,10 +37,7 @@ common_container_settings: &common_container_settings
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 vllm step 10"
- label: "A100 trt benchmark"
priority: 100
agents:
queue: A100
@@ -49,21 +46,7 @@ steps:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.6.2
<<: *common_container_settings
- label: "A100 sglang benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: lmsysorg/sglang:v0.3.2-cu121
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
@@ -75,13 +58,11 @@ steps:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.6.1-cu12
- image: openmmlab/lmdeploy:v0.5.0
<<: *common_container_settings
- label: "A100 trt llama-8B"
- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
@@ -90,25 +71,10 @@ steps:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
- image: vllm/vllm-openai:latest
<<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama8B"
- label: "A100 trt llama-70B"
- label: "A100 tgi benchmark"
priority: 100
agents:
queue: A100
@@ -117,54 +83,12 @@ steps:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
- image: ghcr.io/huggingface/text-generation-inference:2.1
<<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama70B"
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
# - label: "A100 trt benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
# <<: *common_container_settings
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
# - label: "A100 tgi benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
# <<: *common_container_settings
- wait
- label: "Collect the results"
- label: "Plot"
priority: 100
agents:
queue: A100
@@ -193,4 +117,4 @@ steps:
name: hf-token-secret
key: token
- block: ":rocket: check the results!"
- wait

View File

@@ -34,15 +34,6 @@ check_hf_token() {
fi
}
ensure_sharegpt_downloaded() {
local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
if [ ! -f "$FILE" ]; then
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
else
echo "$FILE already exists."
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced with '-'
# example:
@@ -68,38 +59,40 @@ wait_for_server() {
done' && return 0 || return 1
}
kill_processes_launched_by_current_bash() {
# Kill all python processes launched from current bash script
current_shell_pid=$$
processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
if [ -n "$processes" ]; then
echo "Killing the following processes matching '$1':"
echo "$processes"
echo "$processes" | xargs kill -9
else
echo "No processes found matching '$1'."
fi
}
kill_gpu_processes() {
# kill all processes on GPU.
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
if [ -z "$pids" ]; then
echo "No GPU processes found."
else
for pid in $pids; do
kill -9 "$pid"
echo "Killed process with PID: $pid"
done
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pkill -f pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill -f python3
pkill -f /usr/bin/python3
echo "All GPU processes have been killed."
fi
# Sometimes killing by PID doesn't work properly, so we also kill all processes running python or python3,
# since we are in a container anyway
pkill -9 -f python
pkill -9 -f python3
# wait until GPU memory usage smaller than 1GB
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
# waiting for GPU processes to be fully killed
# loop while nvidia-smi returns any processes
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
sleep 1
echo "Waiting for GPU processes to be killed"
done
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
upload_to_buildkite() {
@@ -117,7 +110,7 @@ upload_to_buildkite() {
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
@@ -169,7 +162,7 @@ run_latency_tests() {
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$latency_command"
@@ -179,6 +172,7 @@ run_latency_tests() {
done
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
@@ -226,7 +220,7 @@ run_throughput_tests() {
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
@@ -258,6 +252,7 @@ run_serving_tests() {
continue
fi
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters')
@@ -335,7 +330,7 @@ run_serving_tests() {
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
done
@@ -352,7 +347,6 @@ main() {
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@@ -361,7 +355,7 @@ main() {
# prepare for benchmarking
cd benchmarks || exit 1
ensure_sharegpt_downloaded
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
@@ -371,6 +365,7 @@ main() {
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

View File

@@ -0,0 +1,76 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@@ -174,8 +174,8 @@ if __name__ == "__main__":
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown("../.buildkite/nightly-benchmarks/" +
"performance-benchmarks-descriptions.md")
results = read_markdown(
"../.buildkite/nightly-benchmarks/tests/descriptions.md")
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,

View File

@@ -1,95 +0,0 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def get_perf(df, method, model, metric):
means = []
for qps in [2, 4, 8, 16, "inf"]:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
target = target & df['Test name'].str.contains("qps_" + str(qps))
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
else:
means.append(filtered_df[metric].values[0])
return np.array(means)
def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist()
std = get_perf(df, method, model, "Std " + metric + " (ms)")
if std.mean() == 0:
std = None
success = get_perf(df, method, model, "Successful req.")
if std is not None:
std = std / np.sqrt(success)
std = std.tolist()
else:
assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)")
mean = mean.tolist()
std = None
return mean, std
def main(args):
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@@ -1,241 +0,0 @@
#!/bin/bash
# Currently FP8 benchmark is NOT enabled.
set -x
server_params=$1
common_params=$2
json2args() {
# transforms the JSON string to command line args, and '_' is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
launch_trt_server() {
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
# create model caching directory
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
# clone tensorrt backend
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
# build trtllm engine
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
trtllm-build \
--checkpoint_dir ${trt_model_path} \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
--gpt_attention_plugin ${model_dtype} \
--gemm_plugin ${model_dtype} \
--tp_size ${model_tp_size} \
--max_batch_size ${max_batch_size} \
--max_input_len ${max_input_len} \
--max_seq_len ${max_seq_len} \
--max_num_tokens ${max_num_tokens} \
--output_dir ${trt_engine_path}
# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
mkdir triton_model_repo
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
}
launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
echo "Server command: $server_command"
eval "$server_command" &
}
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Server command: $server_command"
bash -c "$server_command" &
}
launch_sglang_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
launch_vllm_server() {
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
main() {
if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
launch_trt_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
launch_tgi_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
launch_lmdeploy_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
launch_sglang_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
launch_vllm_server
fi
}
main

View File

@@ -0,0 +1,102 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameters inside tensorrt_demo are consistent with the env vars
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &

View File

@@ -8,7 +8,6 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
(which zip) || (apt-get install -y zip)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
@@ -25,54 +24,17 @@ main() {
ls
ls results/
# upload benchmark results
zip -r results.zip results/
/workspace/buildkite-agent artifact upload "results.zip"
# upload benchmarking scripts
cd $VLLM_SOURCE_CODE_LOC/
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
# generate figures
python3 -m pip install tabulate pandas matplotlib
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
--description $description \
--results-folder results/
# The figures should be generated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# # upload results and figures
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
# upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@@ -0,0 +1,135 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@@ -0,0 +1,218 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -1,357 +0,0 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
get_current_llm_serving_engine() {
if which lmdeploy >/dev/null; then
echo "Container: lmdeploy"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
return
fi
if [ -e /tgi-entrypoint.sh ]; then
echo "Container: tgi"
export CURRENT_LLM_SERVING_ENGINE=tgi
return
fi
if which trtllm-build >/dev/null; then
echo "Container: tensorrt-llm"
export CURRENT_LLM_SERVING_ENGINE=trt
return
fi
if [ -e /sgl-workspace ]; then
echo "Container: sglang"
export CURRENT_LLM_SERVING_ENGINE=sglang
return
fi
if [ -e /vllm-workspace ]; then
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid importing vllm from the current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1
done
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which $cmd >/dev/null; then
apt-get update && apt-get install -y $cmd
fi
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if [[ $reuse_server == "true" ]]; then
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
"$server_params" "$common_params"
fi
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break
fi
# prepare tokenizer
# this is required for lmdeploy.
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# change model name for lmdeploy (it does not follow the standard hf name)
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ $backend = "trt" ]]; then
backend="tensorrt-llm"
fi
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
elif [[ "$dataset_name" = "sonnet" ]]; then
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--sonnet-input-len $sonnet_input_len \
--sonnet-output-len $sonnet_output_len \
--sonnet-prefix-len $sonnet_prefix_len \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command="None"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
done
kill_gpu_processes
}
prepare_dataset() {
# download sharegpt dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {
# check if the environment variable is successfully injected from yaml
check_gpus
check_hf_token
get_current_llm_serving_engine
pip install -U transformers
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
# run the test
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,216 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
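# e.g. "NVIDIA A100-SXM4-80GB" -> "A100-SXM4-80GB" (awk prints the second whitespace-separated field)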
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
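# '|| true' tolerates the case where no matching process exists (pkill exits nonzero then)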
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
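# poll once per second, allowing the server up to 1200 s (20 minutes) to come up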
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
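# (jq -c emits each test case as one compact JSON line, which the while loop consumes)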
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name (an engine prefix is appended below).
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
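# jq -e sets its exit status by the result, so the branch below tests whether "fp8" is a key in common_params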
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,214 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name (an engine prefix is appended below).
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update the transformers package to make sure the mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,221 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name (an engine prefix is appended below).
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -17,17 +17,10 @@ serving_column_mapping = {
"request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"std_tpot_ms": "Std TPOT (ms)",
"median_tpot_ms": "Median TPOT (ms)",
"total_token_throughput": "Total Token Tput (tok/s)",
"input_throughput": "Input Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"engine": "Engine",
}
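For context, a minimal sketch of how such a column mapping is typically applied when rendering the summary table (hypothetical values; assumes pandas and tabulate, which the surrounding scripts pip-install, and is not necessarily the summary script's exact logic):
import pandas as pd
serving_column_mapping = {"request_throughput": "Tput (req/s)", "mean_ttft_ms": "Mean TTFT (ms)", "engine": "Engine"}
raw_results = [{"request_throughput": 12.3, "mean_ttft_ms": 45.6, "engine": "vllm"}]  # hypothetical values
df = pd.DataFrame(raw_results).rename(columns=serving_column_mapping)
print(df.to_markdown(index=False))  # pandas delegates markdown rendering to tabulate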

View File

@ -2,11 +2,9 @@
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi

View File

@ -1,42 +1,47 @@
## Latency tests
This test suite aims to test vllm's end-to-end latency under a controlled setup.
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
### Latency benchmarking results
{latency_tests_markdown_table}
## Throughput tests
This test suite aims to test vllm's throughput.
- Input length: from 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
### Throughput benchmarking results
{throughput_tests_markdown_table}
## Serving tests
This test suite aims to test vllm's real serving metrics.
- Input length: from 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
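For reference, a minimal sketch of generating such Poisson arrivals (assumes numpy; illustrative only, not necessarily benchmark_serving.py's exact implementation):
import numpy as np
def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0):
    """Arrival times (seconds) for a Poisson process at rate `qps`."""
    rng = np.random.default_rng(seed)  # fixed seed keeps arrival patterns reproducible
    if qps == float("inf"):
        return np.zeros(num_requests)  # QPS = inf: every request arrives at time 0
    # exponential inter-arrival gaps with mean 1/qps yield a Poisson process at rate qps
    return np.cumsum(rng.exponential(scale=1.0 / qps, size=num_requests))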
### Serving benchmarking results
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.

View File

@ -2,7 +2,7 @@
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
@ -12,7 +12,7 @@
{
"test_name": "latency_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num-iters-warmup": 5,

View File

@ -1,18 +1,16 @@
[
{
"test_name": "llama8B_tp1_sharegpt",
"qps_list": [4,8,16,32,"inf"],
"test_name": "llama8B_tp1",
"qps_list": [4],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"model": "meta-llama/Meta-Llama-3-8B",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000,
"reuse_server": false
"port": 8000
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
@ -23,158 +21,34 @@
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
"disable_log_requests": ""
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_16",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sharegpt",
"qps_list": [4,8,16,32,"inf"],
"test_name": "llama70B_tp4",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000,
"reuse_server": false
"port": 8000
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
@ -185,50 +59,34 @@
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
"disable_log_requests": ""
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sonnet_512_16",
"qps_list": [4,8,16,32,"inf"],
"test_name": "mixtral8x7B_tp2",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tp": 2,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
"port": 8000
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
@ -239,85 +97,20 @@
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
"disable_log_requests": ""
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
}
]

View File

@ -3,7 +3,7 @@
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
@ -11,7 +11,7 @@
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -22,7 +22,7 @@
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
@ -30,7 +30,7 @@
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -55,26 +55,5 @@
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -2,7 +2,7 @@
{
"test_name": "throughput_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -13,7 +13,7 @@
{
"test_name": "throughput_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",

View File

@ -1,28 +1,9 @@
steps:
- label: "Build wheel - CUDA 12.1"
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env:
DOCKER_BUILDKIT: "1"
- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
@ -31,3 +12,8 @@ steps:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"

.buildkite/run-amd-test.sh Executable file → Normal file
View File

@ -1,5 +1,5 @@
# This script runs tests inside the corresponding ROCm docker container.
set -o pipefail
set -ex
# Print ROCm version
echo "--- Confirming Clean Initial State"
@ -55,7 +55,8 @@ while true; do
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN}
image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}
@ -70,85 +71,15 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
# ignore certain kernel tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
fi
# ignore certain entrypoints tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_encoder_decoder.py \
--ignore=entrypoints/openai/test_embedding.py \
--ignore=entrypoints/openai/test_oot_registration.py "}
fi
PARALLEL_JOB_COUNT=8
# if the command contains a shard flag, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
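# each shard below is pinned to a single GPU via HIP_VISIBLE_DEVICES=${GPU}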
# replace the shard arguments
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=${GPU} \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name}_${GPU} \
${image_name} \
/bin/bash -c "${commands}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
# wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do
wait ${pid}
STATUS+=($?)
done
for st in ${STATUS[@]}; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit ${st}
fi
done
else
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${commands}"
fi
/bin/bash -c "${@}"

View File

@ -1,39 +0,0 @@
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"

View File

@ -22,25 +22,8 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_ipex_quant.py"
pip install pytest Pillow protobuf
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
# online inference
docker exec cpu-test bash -c "

View File

@ -12,4 +12,5 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py

View File

@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py

View File

@ -5,498 +5,286 @@
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. it needs to be unblocked manually)
# command(str): the single command to run for the test. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2 or 4.
# num_nodes(int): whether to simulate a multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
# working_dir(str): specify where the command should execute; defaults to /vllm-workspace/tests
# source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run.
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
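# Example step (illustrative only, using the fields documented above):
#   - label: My New Test
#     num_gpus: 2
#     source_file_dependencies:
#       - vllm/core/
#     commands:
#       - pytest -v -s core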
steps:
##### fast check tests #####
- label: Documentation Build # 2min
working_dir: "/vllm-workspace/test_docs/docs"
- label: Async Engine, Inputs, Utils, Worker Test
fast_check: true
no_gpu: True
fast_check_only: true
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs
- tests/multimodal
- tests/test_utils
- tests/worker
commands:
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
- label: Metrics, Tracing Test
fast_check: true
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
fast_check_only: true
commands:
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai" # Tracing
- pytest -v -s tracing
- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
#mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
mirror_hardwares: [amd]
fast_check: true
commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 10min
- label: Core Test
mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
- vllm/core
- vllm/distributed
- tests/core
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py
- label: Entrypoints Test # 40min
- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
fast_check: true
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
num_gpus: 2
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: Distributed Tests (4 GPUs) # 10min
- label: 2 Node Tests (4 GPUs in total)
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing
- pytest -v -s distributed/test_pipeline_parallel.py
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
- label: Engine Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/test_regression
commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/engine
- tests/tokenization
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: Examples Test # 15min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
- label: Entrypoints Test
fast_check: true
mirror_hardwares: [amd]
commands:
- pip install awscli tensorizer # for llava example and tensorizer test
- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 offline_inference_vision_language.py
- python3 offline_inference_vision_language_multi_image.py
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- python3 offline_profile.py --model facebook/opt-125m
- label: Prefix Caching Test # 9min
- label: Inputs Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- label: Samplers Test # 36min
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
- tests/samplers
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- label: Speculative decoding tests # 30min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
# TODO: re-write in comparison tests, and fix symbolic shape
# for quantization ops.
# - label: "PyTorch Fullgraph Test" # 18min
# source_file_dependencies:
# - vllm/
# - tests/compile
# commands:
# - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- vllm/attention
- tests/kernels
- label: Kernels Test %N
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Tensorizer Test # 11min
- label: Models Test
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"
- label: Vision Language Models Test
mirror_hardwares: [amd]
soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- pytest -v -s models -m vlm
- label: Benchmarks # 9min
working_dir: "/vllm-workspace/.buildkite"
- label: Prefix Caching Test
mirror_hardwares: [amd]
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- pytest -v -s prefix_caching
- label: Quantization Test # 33min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
- tests/encoder_decoder
commands:
- pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
mirror_hardwares: [ amd ]
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test #####
- label: Basic Models Test # 3min
source_file_dependencies:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
- label: Decoder-only Language Models Test # 1h36min
- label: Samplers Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language
command: pytest -v -s samplers
- label: Decoder-only Multi-Modal Models Test # 1h31min
- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
- label: Utils Test
commands:
- pytest -v -s test_utils.py
- pytest -v -s test_embedded_commit.py
- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/vision_language
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode
- label: Other Models Test # 6min
- label: LoRA Test %N
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
- pytest -v -s models/embedding/vision_language
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
# This test is used only in the PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE them (not just comment them out) before merging the PR*
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min
- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/model_executor/layers/sampler.py
- vllm/sequence.py
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/multi_step_worker.py
- vllm/worker/model_runner_base.py
- vllm/worker/model_runner.py
- vllm/worker/multi_step_model_runner.py
- vllm/engine
- tests/multi_step
commands:
- pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4
soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora/test_long_context
commands:
# FIXIT: find out which code initializes CUDA before running the test
# until that is fixed, we need to use spawn to run the test
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/weight_loading
- label: Tensorizer Test
#mirror_hardwares: [amd]
soft_fail: true
fast_check: true
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- label: Weight Loading Multiple GPU Test - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
- label: Metrics Test
mirror_hardwares: [amd]
command: pytest -v -s metrics
- label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- pip install aiohttp
- bash run-benchmarks.sh
- label: LM Eval Small Models
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
##### multi gpus test #####
##### A100 test #####
- label: Distributed Tests (A100) # optional
- label: LM Eval Large Models
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
- label: Distributed Tests (A100)
gpu: a100
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
# NOTE: don't test the llama model here; the hf implementation seems buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
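The pipeline steps above all follow one small schema: `source_file_dependencies` gates when a step runs, `commands` lists what it executes, and steps labeled with `%N` fan out across shards via `parallelism` and Buildkite's parallel-job variables. As a rough sketch of that recurring pattern (the label and test paths here are hypothetical, and the exact schema is defined by vLLM's pipeline tooling rather than by Buildkite itself):

```yaml
# Hypothetical step illustrating the schema used throughout the pipeline above.
- label: Example Test %N              # "%N" marks a sharded step
  source_file_dependencies:           # run only when these paths change
    - vllm/example_subsystem
    - tests/example_subsystem
  commands:
    # Each of the 4 parallel jobs receives a distinct shard id, so pytest
    # executes a disjoint quarter of the suite per job.
    - pytest -v -s example_subsystem --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4
```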

View File

@ -1,33 +1 @@
/.venv
/build
dist
vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

.github/CODEOWNERS vendored
View File

@ -1,30 +0,0 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file
# This list covers the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth @WoosukKwon
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

View File

@ -20,10 +20,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -38,10 +38,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -36,10 +36,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -20,25 +20,11 @@ body:
```
It is suggested to download and execute the latest script, as vLLM frequently updates the diagnostic information needed to respond to issues quickly and accurately.
value: |
<details>
<summary>The output of `python collect_env.py`</summary>
```text
Your output of `python collect_env.py` here
The output of `python collect_env.py`
```
</details>
validations:
required: true
- type: textarea
attributes:
label: Model Input Dumps
description: |
If you are facing crashes due to illegal memory access or other issues with model execution, vLLM may dump the problematic model input. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (GitHub doesn't support the .pkl file format) and upload it here. This will help us reproduce the issue and facilitate debugging.
placeholder: |
Upload the dumped input file.
validations:
required: false
- type: textarea
attributes:
label: 🐛 Describe the bug
@ -98,10 +84,3 @@ body:
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true
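The removed blocks above all follow GitHub's issue-forms schema, in which each form element pairs a `type` with `attributes` and optional `validations`. A minimal sketch of a single field (the id and label are hypothetical):

```yaml
- type: textarea
  id: example-field              # hypothetical id
  attributes:
    label: Example field
    description: Free-form details for the reporter to fill in.
  validations:
    required: false              # set true to block submission until filled
```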

View File

@ -29,10 +29,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -31,10 +31,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -50,10 +50,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -47,10 +47,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -19,10 +19,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -39,16 +39,6 @@ FIX #xxxx (*link existing issues this PR will resolve*)
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behavior of vLLM. It helps vLLM users understand and use the new features or changes.</li>
</ul>
<h3>Adding or changing kernels</h3>
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
<ul>
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
</ul>
<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and may not review the PR.</p>

View File

@ -1,7 +0,0 @@
version: 2
updates:
# Maintain dependencies for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"

View File

@ -1,37 +0,0 @@
name: Lint GitHub Actions workflows
on:
push:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
tools/actionlint.sh -color

View File

@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v7
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({

View File

@ -0,0 +1,23 @@
name: Add Ready Label on Ready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
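The new workflow above wires a comment command to a label through `actions/github-script`. The same pattern extends to the inverse operation; a hypothetical sketch (no `/unready` command is part of this change):

```yaml
- name: Remove label
  uses: actions/github-script@v5
  with:
    script: |
      // Hypothetical inverse of the step above: strip the label again.
      github.rest.issues.removeLabel({
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: context.issue.number,
        name: 'ready'
      })
```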

View File

@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -30,11 +30,12 @@ jobs:
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/quantization/gguf/ggml-common.h'
'csrc/quantization/gguf/dequantize.cuh'
'csrc/quantization/gguf/vecdotq.cuh'
'csrc/quantization/gguf/mmq.cuh'
'csrc/quantization/gguf/mmvq.cuh'
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
'csrc/punica/bgmv/bgmv_config.h'
'csrc/punica/bgmv/bgmv_impl.cuh'
'csrc/punica/bgmv/vec_dtypes.cuh'
'csrc/punica/punica_ops.cu'
'csrc/punica/type_convert.h'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \

View File

@ -1,17 +0,0 @@
{
"problemMatcher": [
{
"owner": "actionlint",
"pattern": [
{
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
"file": 1,
"line": 2,
"column": 3,
"message": 4,
"code": 5
}
]
}
]
}
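The deleted JSON above is a GitHub Actions problem matcher: once registered, its `regexp` turns each actionlint output line into a file/line/column annotation on the PR. Matchers are registered from a workflow step with the `::add-matcher::` workflow command; a sketch, assuming the JSON is checked in at a path such as `.github/workflows/matchers/actionlint.json`:

```yaml
steps:
  - name: Register actionlint problem matcher
    # The path below is an assumption; point it at wherever the JSON lives.
    run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
```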

View File

@ -11,25 +11,43 @@ on:
- main
jobs:
mypy:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.11.1
pip install mypy==1.9.0
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
tools/mypy.sh
mypy tests --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/inputs --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/platforms --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml

View File

@ -21,16 +21,16 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Extract branch info
shell: bash
run: |
echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: "actions/github-script@v7"
uses: "actions/github-script@v6"
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
@ -48,13 +48,13 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
@ -86,10 +86,10 @@ jobs:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV
- name: Upload Release Asset
uses: actions/upload-release-asset@v1

View File

@ -8,14 +8,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v7
uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists of a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -15,20 +15,20 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff check .
ruff .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml

View File

@ -1,5 +1,4 @@
#!/bin/bash
set -eux
python_executable=python$1
cuda_home=/usr/local/cuda-$2
@ -9,15 +8,14 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist

View File

@ -14,11 +14,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies

.gitignore vendored
View File

@ -1,8 +1,5 @@
# version file generated by setuptools-scm
/vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/
# vllm commit id, generated by setup.py
vllm/commit_id.py
# Byte-compiled / optimized / DLL files
__pycache__/
@ -15,8 +12,6 @@ __pycache__/
# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
@ -33,7 +28,6 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
/.deps/
# PyInstaller
# Usually these files are written by a python script from a template
@ -93,9 +87,6 @@ target/
profile_default/
ipython_config.py
# generated files
**/generated/**
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
@ -198,7 +189,4 @@ _build/
hip_compat.h
# Benchmark dataset
benchmarks/*.json
# Linting
actionlint
*.json

View File

@ -10,13 +10,12 @@ build:
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
formats:
- pdf
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt

View File

@ -1,16 +1,5 @@
cmake_minimum_required(VERSION 3.26)
cmake_minimum_required(VERSION 3.21)
# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@ -21,17 +10,11 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
@ -49,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
#
@ -83,24 +66,6 @@ endif()
#
find_package(Torch REQUIRED)
#
message(STATUS "Enabling core extension.")
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set(VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp")
define_gpu_extension_target(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI)
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@ -109,7 +74,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
return()
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
endif()
return()
endif()
@ -143,32 +108,14 @@ else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
#
# For other GPU targets override the GPU architectures detected by cmake/torch
# and filter them by the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()
#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
#
# Query torch for additional GPU compilation flags for the given
@ -184,19 +131,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
#
include(FetchContent)
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Define other extension targets
# Define extension targets
#
#
@ -209,6 +145,7 @@ set(VLLM_EXT_SRC
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
@ -218,188 +155,44 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
# CUTLASS 3.5.0
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
)
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}")
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
"in CUDA target architectures")
endif()
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
# The CUTLASS kernels for Hopper require sm90a to be enabled.
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to the compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
else()
if (SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
" for and covered by scaled_mm_c3x")
else()
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels
# The machete kernels only work on hopper and require CUDA 12.0 or later.
# Only build Machete kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
#
# For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
)
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif()
else()
message(STATUS "Machete generation script has not changed, skipping generation.")
endif()
# Add machete generated sources
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
# forward compatible
set_gencode_flags_for_srcs(
SRCS "${MACHETE_GEN_SOURCES}"
CUDA_ARCHS "${MACHETE_ARCHS}")
list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu)
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running w4a16 quantized models on "
"Hopper.")
else()
message(STATUS "Not building Machete kernels as no compatible archs "
"found in CUDA target architectures")
endif()
endif()
# if CUDA endif
endif()
message(STATUS "Enabling C extension.")
define_gpu_extension_target(
_C
DESTINATION vllm
@ -407,16 +200,10 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
#
# _moe_C extension
#
@ -425,36 +212,6 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
"in CUDA target architectures")
endif()
endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
_moe_C
DESTINATION vllm
@ -465,96 +222,90 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# _rocm_C extension
#
set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/attention.cu")
#
# _punica_C extension
#
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/punica_ops.cu"
"csrc/punica/torch_bindings.cpp")
#
# Copy GPU compilation flags+update for punica
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_HALF2_OPERATORS__")
#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
set(VLLM_PUNICA_GPU_ARCHES)
foreach(ARCH ${VLLM_GPU_ARCHES})
string_to_ver(CODE_VER ${ARCH})
if (CODE_VER GREATER_EQUAL 8.0)
list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
define_gpu_extension_target(
_rocm_C
_punica_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_ROCM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
SOURCES ${VLLM_PUNICA_EXT_SRC}
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
endif()
# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_PROGRESS TRUE
)
message(WARNING "Unable to create _punica_C target because none of the "
"requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()
# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
# Copy over the vllm-flash-attn python files
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
)
# Nothing after vllm-flash-attn, see comment about macros above
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# there are supported target arches.
if (VLLM_PUNICA_GPU_ARCHES AND
(ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
message(STATUS "Enabling punica extension.")
add_dependencies(default _punica_C)
endif()
endif()

View File

@ -1,128 +0,0 @@
# vLLM Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official email address,
posting via an official social media account, or acting as an appointed
representative at an online or offline/IRL event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
version 2.1, available at
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).

View File

@ -1,23 +1,30 @@
# Contributing to vLLM
Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
Thank you for your interest in contributing to vLLM!
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project:
- Identify and report any issues or bugs.
- Request or add support for a new model.
- Request or add a new model.
- Suggest or implement new features.
- Improve documentation or contribute a how-to guide.
We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
However, remember that contributions aren't just about code.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
## Developing
## Setup for development
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
### Build from source
```bash
pip install -e . # This may take several minutes.
```
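If you are only changing Python code, a full kernel build is often unnecessary; a minimal sketch, assuming the `VLLM_USE_PRECOMPILED` option from the build-from-source docs is available in your checkout:
```bash
# Reuse precompiled binaries instead of rebuilding the CUDA kernels
# (assumption: this environment variable is supported by your setup.py).
VLLM_USE_PRECOMPILED=1 pip install -e .
```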
## Testing
### Testing
```bash
pip install -r requirements-dev.txt
@ -29,16 +36,15 @@ mypy
# Unit tests
pytest tests/
```
**Note:** Currently, the repository does not pass the ``mypy`` tests.
**Note:** Currently, the repository does not pass the mypy tests.
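To iterate on one area without running the whole suite, pytest's standard selection flags apply; the path and keyword below are illustrative only, not prescribed by this repo:
```bash
# Run a single test directory, or filter tests by a keyword expression
# (hypothetical path and keyword, shown only as an example).
pytest tests/samplers -v
pytest tests/ -k "prefix_caching"
```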
## Contribution Guidelines
### Issues
## Contributing Guidelines
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
### Issue Reporting
> [!IMPORTANT]
> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).
If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
If not, please file a new issue, providing as much relevant information as possible.
### Pull Requests & Code Reviews
@ -47,4 +53,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE
### Thank You
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
All of your contributions help make vLLM a great tool and community for everyone!
Your contributions make vLLM a great tool for everyone!
View File
@ -9,31 +9,28 @@ ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
RUN apt-get update -y \
&& apt-get install -y git curl sudo
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -49,6 +46,9 @@ COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
COPY requirements-mamba.txt requirements-mamba.txt
RUN python3 -m pip install packaging
RUN python3 -m pip install -r requirements-mamba.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -56,24 +56,31 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
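# Example (hypothetical values): both arch lists can be narrowed for a
# single-generation build to cut compile time and binary size, e.g.:
#   docker build --build-arg torch_cuda_arch_list='9.0+PTX' \
#                --build-arg vllm_fa_cmake_gpu_arches='90-real' .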
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG PYTHON_VERSION=3.10
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
# max jobs used by Ninja to build extensions
ARG max_jobs=2
@ -81,24 +88,27 @@ ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
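# Example (hypothetical bucket name): a CI-style build with sccache enabled:
#   docker build --build-arg USE_SCCACHE=1 \
#                --build-arg SCCACHE_BUCKET_NAME=my-sccache-bucket .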
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& if [ "$CUDA_VERSION" = "11.8.0" ]; then \
export SCCACHE_BUCKET=vllm-build-sccache-2; \
else \
export SCCACHE_BUCKET=vllm-build-sccache; \
fi \
&& export SCCACHE_REGION=us-west-2 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
@ -108,22 +118,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# Check the size of the wheel if RUN_WHEEL_CHECK is true
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# Default max size of the wheel is 250MB
ARG VLLM_MAX_SIZE_MB=250
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
python3 check-wheel-size.py dist; \
else \
echo "Skipping wheel size check."; \
fi
RUN python3 check-wheel-size.py dist
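# Example (hypothetical values): either raise the limit or skip the check:
#   docker build --build-arg VLLM_MAX_SIZE_MB=400 .
#   docker build --build-arg RUN_WHEEL_CHECK=false .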
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
@ -136,31 +138,45 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### MAMBA Build IMAGE ####################
FROM dev as mamba-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
WORKDIR /usr/src/mamba
COPY requirements-mamba.txt requirements-mamba.txt
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel -r requirements-mamba.txt \
--no-build-isolation --no-deps --no-cache-dir
#################### MAMBA Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG PYTHON_VERSION=3.10
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git vim curl libibverbs-dev
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -173,10 +189,12 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
COPY examples examples
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
#################### vLLM installation IMAGE ####################
@ -206,7 +224,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
pip install accelerate hf_transfer 'modelscope!=1.15.0'
ENV VLLM_USAGE_SOURCE production-docker-image
View File
@ -2,61 +2,37 @@
FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
RUN apt-get update -y \
&& apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp
RUN pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.4.0
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
FROM cpu-test-1 AS build
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
View File
@ -1,41 +1,36 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities
RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && apt-get install python3 python3-pip -y
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN python3 -m pip install -U \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
pip install --no-build-isolation -v -e . \
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..
CMD ["/bin/bash"]
View File
@ -1,23 +1,24 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
FROM ubuntu:22.04 AS dev
FROM ubuntu:20.04 AS dev
RUN apt-get update -y && \
apt-get install -y \
git python3-pip \
ffmpeg libsm6 libxext6 libgl1
apt-get install -y python3-pip git
WORKDIR /workspace
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/
COPY vllm/ /workspace/vllm/vllm
COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
View File
@ -2,35 +2,21 @@ FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
# These packages will be in rocketce eventually
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
WORKDIR /vllm-workspace
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
View File
@ -1,5 +1,5 @@
# Default ROCm 6.2 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
@ -7,12 +7,18 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="3cea2fb"
ARG FA_BRANCH="23a2b1c2"
# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e192dba"
ARG TRITON_BRANCH="e0fc12c"
### Base image build stage
FROM $BASE_IMAGE AS base
@ -44,17 +50,14 @@ RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.6.0 on ROCm
RUN --mount=type=cache,target=/root/.cache/pip \
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.2"*) \
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
torch==2.6.0.dev20240918 \
setuptools-scm>=8 \
torchvision==0.20.0.dev20240918 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240710 \
torchvision==0.20.0.dev20240710 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
@ -76,18 +79,25 @@ RUN cd /opt/rocm/share/amd_smi \
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, we download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
# Otherwise create an empty directory, as later build stages expect one
else mkdir -p /install; \
fi
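# Example (hypothetical): force a source build from FA_BRANCH by skipping
# the prebuilt-wheel lookup:
#   docker build --build-arg TRY_FA_WHEEL=0 .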
@ -102,7 +112,6 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& python3 -m pip install ninja cmake wheel pybind11 \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
@ -117,24 +126,28 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=bind,source=.git,target=.git \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \
&& python3 setup.py develop
View File
@ -1,32 +1,20 @@
ARG NIGHTLY_DATE="20240828"
ARG NIGHTLY_DATE="20240713"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace/vllm
# Install some basic utilities
RUN apt-get update && apt-get install -y \
git \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install NumPy 1 instead of NumPy 2.
RUN pip install "numpy<2"
# Install the TPU and Pallas dependencies.
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM.
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 -m pip install \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-tpu.txt
RUN python3 setup.py develop
RUN cd /workspace/vllm && python setup.py develop
CMD ["/bin/bash"]
View File
@ -1,58 +1,22 @@
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
ffmpeg \
git \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
python3 \
python3-dev \
python3-pip \
# vim \
wget
RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
COPY requirements-common.txt /workspace/vllm/requirements-common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt
RUN pip install -v -r requirements-xpu.txt
COPY . .
ARG GIT_REPO_CHECK
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
ENV VLLM_TARGET_DEVICE=xpu
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 setup.py install
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
View File
@ -10,14 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
</p>
---
*Latest News* 🔥
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
@ -37,12 +36,10 @@ vLLM is fast with:
- Efficient management of attention key and value memory with **PagedAttention**
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
vLLM is flexible and easy to use with:
@ -51,21 +48,20 @@ vLLM is flexible and easy to use with:
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support
- Multi-lora support
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral)
- Embedding Models (e.g. E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
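A minimal sketch of serving one of these models with the OpenAI-compatible server (the model name below is just an example):
```bash
# Launch the OpenAI-compatible API server with an example model.
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m
```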
## Getting Started
Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
```bash
pip install vllm
@ -103,7 +99,6 @@ vLLM is a community project. Our compute resources for development and testing a
- Roblox
- RunPod
- Sequoia Capital
- Skywork AI
- Trainy
- UC Berkeley
- UC San Diego
@ -122,11 +117,3 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
year={2023}
}
```
## Contact Us
* For technical questions and feature requests, please use GitHub issues or discussions.
* For discussing with fellow users, please use Discord.
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use GitHub's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
View File
@ -1,11 +0,0 @@
# Security Policy
## Reporting a Vulnerability
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
---
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
View File
@ -23,9 +23,7 @@ class RequestFuncInput:
output_len: int
model: str
best_of: int = 1
logprobs: Optional[int] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
use_beam_search: bool = False
@dataclass
@ -48,13 +46,13 @@ async def async_request_tgi(
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
# TGI does not accept ignore_eos flag.
}
payload = {
"inputs": request_func_input.prompt,
@ -119,6 +117,7 @@ async def async_request_trt_llm(
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
@ -128,8 +127,6 @@ async def async_request_trt_llm(
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@ -184,6 +181,7 @@ async def async_request_deepspeed_mii(
) -> RequestFuncOutput:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
assert not request_func_input.use_beam_search
payload = {
"prompt": request_func_input.prompt,
@ -227,19 +225,18 @@ async def async_request_openai_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
"completions"
), "OpenAI Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
@ -279,9 +276,8 @@ async def async_request_openai_completions(
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
@ -312,21 +308,18 @@ async def async_request_openai_chat_completions(
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
assert not request_func_input.use_beam_search
payload = {
"model": request_func_input.model,
"messages": [
{
"role": "user",
"content": content
"content": request_func_input.prompt,
},
],
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
headers = {
"Content-Type": "application/json",
@ -430,5 +423,4 @@ ASYNC_REQUEST_FUNCS = {
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
View File
@ -10,8 +10,8 @@ import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
from vllm.inputs import PromptType
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
@ -38,6 +38,7 @@ def main(args: argparse.Namespace):
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
@ -50,8 +51,9 @@ def main(args: argparse.Namespace):
sampling_params = SamplingParams(
n=args.n,
temperature=1.0,
temperature=0.0 if args.use_beam_search else 1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=args.output_len,
)
@ -59,7 +61,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: List[PromptType] = [{
dummy_inputs: List[PromptInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
@ -72,13 +74,13 @@ def main(args: argparse.Namespace):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(dummy_prompts,
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(dummy_prompts,
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@ -203,11 +205,13 @@ if __name__ == '__main__':
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument("--device",
type=str,
default="auto",
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
@ -220,6 +224,7 @@ if __name__ == '__main__':
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
View File
@ -1,45 +1,8 @@
"""
Benchmark the efficiency of prefix caching.
This script allows you to benchmark the performance of
a model with and without prefix caching using either fixed prompts
or prompts sampled from the ShareGPT dataset.
Fixed example usage:
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100
ShareGPT example usage:
# This command samples 20 prompts with input lengths
# between 128 and 256 tokens from the ShareGPT dataset,
# then replicates each prompt 5 times.
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
"""
import json
import random
import time
from typing import List, Optional, Tuple
from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@ -52,97 +15,19 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}")
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
min_len, max_len = input_length_range
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if min_len <= prompt_len <= max_len:
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
if sort:
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
return [req[0] for req in repeated_requests]
def main(args):
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed)
if args.dataset_path is not None:
print(f"Start to sample {args.num_prompts} prompts"
"from {args.dataset_path}")
filtered_datasets = sample_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
input_length_range=input_length_range,
fixed_output_len=args.output_len,
)
else:
prompt_len = len(tokenizer(PROMPT).input_ids)
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts
llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("Testing filtered datasets")
prompts = repeat_and_sort_requests(filtered_datasets,
repeat_count=args.repeat_count,
sort=args.sort)
print("------warm up------")
test_prefix(
llm=llm,
@ -160,39 +45,18 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description=
'Benchmark the performance with or without automatic prefix caching.')
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',
type=str,
default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--num-prompts',
type=int,
default=1,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
default=100,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
default='128:256',
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
parser.add_argument("--seed",
type=int,
default=0,
help='Random seed for reproducibility')
help='Use BlockSpaceManagerV2')
args = parser.parse_args()
main(args)
View File
@ -1,293 +0,0 @@
"""Benchmark offline prioritization."""
import argparse
import json
import random
import time
from typing import List, Optional, Tuple
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int, int]]:  # (prompt, prompt_len, output_len, priority)
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
# Select an equiprobable random priority
priority = 0 if random.random() < 0.5 else 1
filtered_dataset.append((prompt, prompt_len, output_len, priority))
return filtered_dataset
def run_vllm(
requests: List[Tuple[str, int, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
disable_log_stats=False,
)
# Add the requests to the engine.
prompts = []
sampling_params = []
priority = []
for prompt, _, output_len, _priority in requests:
prompts.append(prompt)
priority.append(_priority)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
))
start = time.perf_counter()
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
end = time.perf_counter()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length, assigning a default
# priority of 0 so the tuples match the shape produced by sample_requests.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len, 0)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.trust_remote_code,
args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching,
args.enable_chunked_prefill,
args.max_num_batched_tokens,
args.gpu_memory_utilization, args.download_dir)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len, priority in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=200,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1. '
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens',
type=int,
default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
main(args)
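As a usage note, the file written via --output-json can be post-processed with a few lines; the keys below match the results dict above (the filename is whatever path was passed to the flag).

import json

with open("throughput.json") as f:  # the path passed to --output-json
    results = json.load(f)
print(f"{results['requests_per_second']:.2f} requests/s, "
      f"{results['tokens_per_second']:.2f} tokens/s over "
      f"{results['num_requests']} requests")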

View File

@ -1,4 +1,4 @@
r"""Benchmark online serving throughput.
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
@ -24,8 +24,6 @@ On the client side, run:
"""
import argparse
import asyncio
import base64
import io
import json
import os
import random
@ -33,13 +31,11 @@ import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from datasets import load_dataset
from PIL.Image import Image
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
@ -60,27 +56,20 @@ class BenchmarkMetrics:
total_input: int
total_output: int
request_throughput: float
input_throughput: float
output_throughput: float
total_token_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: List[Tuple[float, float]]
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: List[Tuple[float, float]]
p99_tpot_ms: float
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: List[Tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: List[Tuple[float, float]]
p99_itl_ms: float
def sample_sharegpt_requests(
@ -88,9 +77,11 @@ def sample_sharegpt_requests(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int, None]]:
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path, encoding='utf-8') as f:
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
@ -115,13 +106,13 @@ def sample_sharegpt_requests(
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len, None))
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
@ -133,13 +124,13 @@ def sample_sonnet_requests(
output_len: int,
prefix_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int, None]]:
) -> List[Tuple[str, str, int, int]]:
assert (
input_len > prefix_len
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
# Load the dataset.
with open(dataset_path, encoding='utf-8') as f:
with open(dataset_path) as f:
poem_lines = f.readlines()
# Tokenize the poem lines.
@ -176,9 +167,9 @@ def sample_sonnet_requests(
# Sample the rest of lines per request.
sampled_requests: List[Tuple[str, int, int]] = []
for _ in range(num_requests):
num_lines_needed = num_input_lines - num_prefix_lines
sampled_lines = "".join(prefix_lines +
random.choices(poem_lines, k=num_lines_needed))
sampled_lines = "".join(
prefix_lines +
random.sample(poem_lines, num_input_lines - num_prefix_lines))
prompt = f"{base_prompt}{sampled_lines}"
message = [
@ -191,81 +182,14 @@ def sample_sonnet_requests(
message, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
sampled_requests.append(
(prompt, prompt_formatted, prompt_len, output_len, None))
return sampled_requests
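The two variants of line sampling above differ in replacement semantics: random.choices draws with replacement, so it keeps working when the number of lines needed exceeds the poem length, whereas random.sample raises in that case. A quick illustration:

import random

poem_lines = ["a\n", "b\n", "c\n"]
print(random.choices(poem_lines, k=5))  # with replacement; k may exceed len()
try:
    random.sample(poem_lines, 5)        # without replacement
except ValueError as err:
    print(err)                          # "Sample larger than population ..."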
def sample_hf_requests(
dataset_path: str,
dataset_subset: str,
dataset_split: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
streaming=True)
assert "conversations" in dataset.features, (
"HF Dataset must have 'conversations' column.")
filtered_dataset = dataset.shuffle().filter(
lambda x: len(x["conversations"]) >= 2)
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in filtered_dataset:
if len(sampled_requests) == num_requests:
break
# Tokenize the prompts and completions.
prompt = data["conversations"][0]["value"]
prompt_token_ids = tokenizer(prompt).input_ids
completion = data["conversations"][1]["value"]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
# Prune too short sequences.
continue
if fixed_output_len is None and \
(prompt_len > 1024 or prompt_len + output_len > 2048):
# Prune too long sequences.
continue
if "image" in data and isinstance(data["image"], Image):
image: Image = data["image"]
image = image.convert("RGB")
image_data = io.BytesIO()
image.save(image_data, format='JPEG')
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
mm_content = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
else:
mm_content = None
sampled_requests.append((prompt, prompt_len, output_len, mm_content))
(prompt, prompt_formatted, prompt_len, output_len))
return sampled_requests
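The mm_content built above follows the OpenAI chat-completions image_url convention, so it can be dropped straight into a multimodal user message; a self-contained sketch (the image here is a synthetic stand-in for a dataset image):

import base64
import io

from PIL import Image

image = Image.new("RGB", (64, 64), color="red")  # stand-in for a dataset image
buf = io.BytesIO()
image.save(buf, format="JPEG")
image_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe the image."},
        {"type": "image_url",
         "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}},
    ],
}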
def sample_random_requests(
prefix_len: int,
input_len: int,
output_len: int,
num_prompts: int,
range_ratio: float,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
prefix_token_ids = np.random.randint(0,
tokenizer.vocab_size,
size=prefix_len).tolist()
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
input_lens = np.random.randint(
int(input_len * range_ratio),
@ -280,12 +204,10 @@ def sample_random_requests(
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode(prefix_token_ids +
[(offsets[i] + i + j) % tokenizer.vocab_size
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append((prompt, int(prefix_len + input_lens[i]),
int(output_lens[i]), None))
input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i])))
return input_requests
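One subtlety in sample_random_requests: the prompt is produced by decoding random token ids, and re-tokenizing that string is not guaranteed to yield exactly input_lens[i] tokens, so the recorded prompt length is an approximation. A quick way to observe the drift for a given tokenizer:

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # tokenizer choice illustrative
ids = np.random.randint(0, tokenizer.vocab_size, size=128).tolist()
reencoded = tokenizer(tokenizer.decode(ids)).input_ids
print(len(ids), len(reencoded))  # usually close, not always equal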
@ -313,8 +235,6 @@ def calculate_metrics(
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
@ -322,7 +242,6 @@ def calculate_metrics(
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
@ -339,7 +258,6 @@ def calculate_metrics(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
completed += 1
else:
actual_output_lens.append(0)
@ -354,29 +272,21 @@ def calculate_metrics(
total_input=total_input,
total_output=sum(actual_output_lens),
request_throughput=completed / dur_s,
input_throughput=total_input / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
)
return metrics, actual_output_lens
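To make the latency decomposition concrete: for a request with TTFT 0.5 s, end-to-end latency 2.0 s, and 11 output tokens, TPOT excludes the first token, so TPOT = (2.0 - 0.5) / (11 - 1) = 0.15 s, while E2EL is simply the 2.0 s client-side latency. The same arithmetic as code:

ttft, latency, output_len = 0.5, 2.0, 11
tpot = (latency - ttft) / (output_len - 1)
print(f"TPOT = {tpot * 1000:.0f} ms, E2EL = {latency * 1000:.0f} ms")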
@ -385,18 +295,13 @@ def calculate_metrics(
async def benchmark(
backend: str,
api_url: str,
base_url: str,
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
best_of: int,
use_beam_search: bool,
request_rate: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -404,22 +309,15 @@ async def benchmark(
raise ValueError(f"Unknown backend: {backend}")
print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len, test_mm_content = (
input_requests[0])
if backend != "openai-chat" and test_mm_content is not None:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' backend.")
test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
use_beam_search=use_beam_search,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
@ -428,22 +326,6 @@ async def benchmark(
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
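get_request (defined elsewhere in this file) paces the request stream at the printed rate; a minimal sketch of the Poisson-process pacing it is assumed to implement, with inter-arrival gaps drawn from an exponential distribution of mean 1/request_rate:

import asyncio

import numpy as np

async def get_request(input_requests, request_rate: float):
    for request in input_requests:
        yield request
        if request_rate == float("inf"):
            continue  # burst mode: issue all requests immediately
        await asyncio.sleep(np.random.exponential(1.0 / request_rate))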
@ -451,37 +333,22 @@ async def benchmark(
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=mm_content,
ignore_eos=ignore_eos)
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
model=model_id,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
tasks.append(
asyncio.create_task(
request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None:
pbar.close()
@ -492,8 +359,6 @@ async def benchmark(
outputs=outputs,
dur_s=benchmark_duration,
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@ -505,10 +370,27 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
metrics.input_throughput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
metrics.median_ttft_ms))
print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
n=50,
c='-'))
print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
print("=" * 50)
result = {
"duration": benchmark_duration,
@ -516,8 +398,20 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"input_throughput": metrics.input_throughput,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
@ -525,47 +419,6 @@ async def benchmark(
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
}
def process_one_metric(
# E.g., "ttft"
metric_attribute_name: str,
# E.g., "TTFT"
metric_name: str,
# E.g., "Time to First Token"
metric_header: str,
):
# This function prints and adds statistics of the specified
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
print("=" * 50)
return result
@ -580,10 +433,8 @@ def main(args: argparse.Namespace):
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
base_url = f"{args.base_url}"
else:
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
@ -620,9 +471,9 @@ def main(args: argparse.Namespace):
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt, prompt_len, output_len, None)
input_requests = [(prompt, prompt_len, output_len)
for prompt, prompt_formatted, prompt_len,
output_len, _ in input_requests]
output_len in input_requests]
else:
assert (
tokenizer.chat_template or tokenizer.default_chat_template
@ -635,23 +486,12 @@ def main(args: argparse.Namespace):
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt_formatted, prompt_len, output_len, None)
input_requests = [(prompt_formatted, prompt_len, output_len)
for prompt, prompt_formatted, prompt_len,
output_len, _ in input_requests]
elif args.dataset_name == "hf":
input_requests = sample_hf_requests(
dataset_path=args.dataset_path,
dataset_subset=args.hf_subset,
dataset_split=args.hf_split,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.hf_output_len,
)
output_len in input_requests]
elif args.dataset_name == "random":
input_requests = sample_random_requests(
prefix_len=args.random_prefix_len,
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
@ -666,20 +506,13 @@ def main(args: argparse.Namespace):
benchmark(
backend=backend,
api_url=api_url,
base_url=base_url,
model_id=model_id,
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
best_of=args.best_of,
use_beam_search=args.use_beam_search,
request_rate=args.request_rate,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos,
))
# Save config and results to json
@ -693,6 +526,7 @@ def main(args: argparse.Namespace):
result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of
result_json["use_beam_search"] = args.use_beam_search
result_json["num_prompts"] = args.num_prompts
# Metadata
@ -720,7 +554,7 @@ def main(args: argparse.Namespace):
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile:
with open(file_name, "w") as outfile:
json.dump(result_json, outfile)
@ -758,14 +592,13 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet", "random", "hf"],
choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.")
help="Path to the dataset.")
parser.add_argument(
"--model",
type=str,
@ -793,14 +626,52 @@ if __name__ == "__main__":
help="Number of prompts to process.",
)
parser.add_argument(
"--logprobs",
"--sharegpt-output-len",
type=int,
default=None,
help=("Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"),
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
parser.add_argument(
"--sonnet-input-len",
type=int,
default=550,
help=
"Number of input tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--sonnet-output-len",
type=int,
default=150,
help=
"Number of output tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--sonnet-prefix-len",
type=int,
default=200,
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument(
"--request-rate",
@ -822,12 +693,6 @@ if __name__ == "__main__":
action="store_true",
help="Specify to disable tqdm progress bar.",
)
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--save-result",
action="store_true",
@ -857,108 +722,6 @@ if __name__ == "__main__":
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
" format.",
)
parser.add_argument(
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
sonnet_group.add_argument(
"--sonnet-input-len",
type=int,
default=550,
help=
"Number of input tokens per request, used only for sonnet dataset.",
)
sonnet_group.add_argument(
"--sonnet-output-len",
type=int,
default=150,
help=
"Number of output tokens per request, used only for sonnet dataset.",
)
sonnet_group.add_argument(
"--sonnet-prefix-len",
type=int,
default=200,
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
sharegpt_group = parser.add_argument_group("sharegpt dataset options")
sharegpt_group.add_argument(
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
random_group.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
random_group.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
random_group.add_argument(
"--random-prefix-len",
type=int,
default=0,
help="Number of fixed prefix tokens before random "
" context. The length range of context in a random "
" request is [random-prefix-len, "
" random-prefix-len + random-prefix-len * random-range-ratio).")
hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output lengths "
"from the sampled HF dataset.",
)
args = parser.parse_args()
main(args)

View File

@ -6,17 +6,13 @@ import time
from typing import List, Optional, Tuple
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
from vllm.utils import FlexibleArgumentParser
def sample_requests(
@ -73,6 +69,7 @@ def run_vllm(
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
@ -85,10 +82,8 @@ def run_vllm(
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@ -111,8 +106,6 @@ def run_vllm(
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
)
# Add the requests to the engine.
@ -123,126 +116,29 @@ def run_vllm(
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
use_beam_search = False
if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
prompts = [prompt for prompt, _, _ in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
for prompt, input_len, _output_len in requests:
assert _output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
BeamSearchParams(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
))
end = time.perf_counter()
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start
async def run_vllm_async(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
disable_frontend_multiprocessing: bool = False,
) -> float:
from vllm import SamplingParams
engine_args = AsyncEngineArgs(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
disable_log_requests=True,
)
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
))
generators = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
use_beam_search: bool,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
@ -274,7 +170,7 @@ def run_hf(
padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
do_sample=not use_beam_search,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
@ -328,27 +224,20 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
run_args = [
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.download_dir, args.load_format, args.disable_async_output_proc
]
if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing)
elapsed_time = uvloop.run(run_vllm_async(*run_args))
else:
elapsed_time = run_vllm(*run_args)
args.gpu_memory_utilization, args.download_dir, args.load_format)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.hf_max_batch_size, args.trust_remote_code)
args.use_beam_search, args.hf_max_batch_size,
args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
@ -402,6 +291,7 @@ if __name__ == "__main__":
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts",
type=int,
default=1000,
@ -456,20 +346,17 @@ if __name__ == "__main__":
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument("--device",
type=str,
default="auto",
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument(
"--num-scheduler-steps",
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching for vLLM backend.")
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
@ -518,19 +405,6 @@ if __name__ == "__main__":
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
"--disable-async-output-proc",
action='store_true',
default=False,
help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
@ -553,6 +427,8 @@ if __name__ == "__main__":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.use_beam_search:
raise ValueError("Beam search is not supported for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:

View File

@ -13,7 +13,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
@ -32,6 +32,7 @@ def to_int8(tensor: torch.Tensor) -> torch.Tensor:
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
@ -43,18 +44,59 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
raise ValueError("unsupported dtype")
# impl
def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch.mm(a, b)
def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype)
def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
scale_a: torch.Tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype,
use_fast_accum=True)
def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
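For context, a standalone sketch of the torch._scaled_mm call wrapped above, assuming an FP8-capable GPU (e.g. Hopper) and a recent PyTorch; b must be column-major (hence the .t()), and some PyTorch releases return an (output, amax) tuple rather than a bare tensor. use_fast_accum=True trades a little accumulation accuracy for speed.

import torch

m, k, n = 16, 32, 64
a = torch.randn((m, k), device="cuda").to(torch.float8_e4m3fn)
b = torch.randn((n, k), device="cuda").t().to(torch.float8_e4m3fn)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
out = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b,
                       out_dtype=torch.bfloat16)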
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
**kwargs) -> TMeasurement:
def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
sub_label: str, fn: Callable, description: str) -> TMeasurement:
min_run_time = 1
globals = {
"args": args,
"kwargs": kwargs,
"a": a,
"b": b,
"scale_a": scale_a,
"scale_b": scale_b,
"out_dtype": out_dtype,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(*args, **kwargs)",
stmt="fn(a, b, scale_a, scale_b, out_dtype)",
globals=globals,
label=label,
sub_label=sub_label,
@ -68,58 +110,19 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
timers = []
# pytorch impl - bfloat16
# pytorch impl
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16),
b.to(dtype=torch.bfloat16)))
# pytorch impl - float16
timers.append(
bench_fn(label, sub_label,
"pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# cutlass impl
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass with bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass with azp per-tensor
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj))
# cutlass with azp per-tensor + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, None, bias))
# cutlass with azp per-token
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp))
# cutlass with azp per-token + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp, bias))
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
return timers
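Loosely, the azp variants above fold an asymmetric-quantization zero-point correction into the GEMM: if the quantized activation is A_q = A/scale + azp, then A_q @ B overshoots the true product by azp times the column sums of B, which is what azp_adj precomputes. The integer identity behind it:

import torch

m, k, n = 4, 8, 3
a_q = torch.randint(0, 16, (m, k), dtype=torch.int32)
b = torch.randint(-8, 8, (k, n), dtype=torch.int32)
azp = 5
azp_adj = b.sum(dim=0)  # per-output-column correction term
assert torch.equal(a_q @ b - azp * azp_adj, (a_q - azp) @ b)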
@ -130,88 +133,46 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
timers = []
# pytorch impl w. bf16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda")))
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16))
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True))
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
pytorch_fp8_impl_fast_accum,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16))
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16,
use_fast_accum=True))
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
pytorch_fp8_impl_fast_accum,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
# cutlass impl: bf16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass impl: fp16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)))
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
return timers
@ -232,6 +193,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@ -247,6 +209,7 @@ def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
print_timers(data)
@ -281,6 +244,7 @@ def run_range_bench(args):
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")

View File

@ -1,86 +0,0 @@
import time
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
seed_everything)
@torch.inference_mode()
def main(num_tokens: int,
hidden_size: int,
add_residual: bool,
dtype: torch.dtype,
seed: int = 0,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
seed_everything(seed)
torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1)
scale = 1 / (2 * hidden_size)
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
x *= scale
residual = torch.randn_like(x) * scale if add_residual else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
layer(x, residual)
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.
print("Warming up...")
run_benchmark = run_cuda_benchmark
run_benchmark(num_iters=num_warmup_iters, profile=False)
# Benchmark.
if do_profile:
latency = run_benchmark(num_iters=1, profile=True)
else:
latency = run_benchmark(num_iters=num_iters, profile=False)
print(f"Kernel running time: {latency * 1000000:.3f} us")
if __name__ == '__main__':
parser = FlexibleArgumentParser(
description="Benchmark the layernorm kernel.")
parser.add_argument("--num-tokens", type=int, default=4096)
parser.add_argument("--hidden-size", type=int, default=8192)
parser.add_argument("--add-residual", action="store_true")
parser.add_argument("--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="half")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
parser.add_argument("--num-warmup-iters", type=int, default=5)
parser.add_argument("--num-iters",
type=int,
default=100,
help="Number of benchmark iterations. "
"If --profile is set, this number is ignored")
args = parser.parse_args()
print(args)
main(num_tokens=args.num_tokens,
hidden_size=args.hidden_size,
add_residual=args.add_residual,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
seed=args.seed,
do_profile=args.profile,
num_warmup_iters=args.num_warmup_iters,
num_iters=args.num_iters)

View File

@ -1,420 +0,0 @@
import argparse
import copy
import itertools
import math
import pickle as pkl
import time
from itertools import product
from typing import Callable, Iterable, List, Optional, Tuple
import pandas as pd
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, pack_rows, quantize_weights)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
DEFAULT_TP_SIZES = [1]
def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
w_q = w_q.t().contiguous().t() # make col major
return ops.machete_prepack_B(w_q, wtype)
def make_bench_tensors(
atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
k: int
) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
torch.tensor]]]:
assert wtype.is_integer(), "TODO: support floating point weights"
# We want to make sure that the weights don't fit into L2 cache between runs,
# so we construct enough weight copies to exceed the L2 cache, which is 50 MB
# on an H100; we therefore target a total weight size > 2 * 50 MB.
num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
a = torch.randn((m, k), device="cuda", dtype=atype) * 5
weights = [
torch.randn((k, n), device="cuda", dtype=atype)
for _ in range(num_weights)
]
quantized_weights = [
quantize_weights(w, wtype, group_size) for w in weights
]
return a, quantized_weights
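Spelling out the sizing above: num_weights = ceil(2 * 50 MiB * 8 / (k * n * size_bits)). For k = n = 4096 with 4-bit weights, each copy occupies 8 MiB, so 13 copies (~104 MiB) are cycled through, keeping every iteration streaming from DRAM instead of L2:

import math

k, n, size_bits = 4096, 4096, 4
num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * size_bits))
print(num_weights)  # 13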
# impl
# bench
def bench_fn(label: str, sub_label: str, description: str,
fn: Callable) -> TMeasurement:
min_run_time = 1
return TBenchmark.Timer(
stmt="fn()",
globals={
"fn": fn
},
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
def loop_over_weights(
a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
torch.tensor, torch.tensor]],
fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
None]):
for w_ref, w_q, w_s, _ in weights:
fn(a, w_ref, w_q, w_s)
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
def bench(atype: torch.dtype,
wtype: ScalarType,
group_size: int,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
benchmark_marlinv1: bool = True,
sweep_schedules: bool = True) -> Iterable[TMeasurement]:
global _SWEEP_SCHEDULES_RESULTS
a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
sub_label += f", L={len(weights)}"
weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
for w_ref, w_q, w_s, w_zp in weights]
timers = []
# pytorch impl
timers.append(
bench_fn(
label, sub_label, "torch.matmul", lambda: loop_over_weights(
a,
weights,
lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
)))
if benchmark_marlinv1:
w_ref = weights[0][0]
w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
wtype.size_bits)
def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
return marlin_permute_scales(w_s, *w_ref.shape, group_size)
weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
marlinv1_permute_scales(w_s), w_zp)
for w_ref, w_q, w_s, w_zp in weights]
workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
# marlinv1
timers.append(
bench_fn(
label, sub_label, "marlin_orig", lambda: loop_over_weights(
a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
gptq_marlin_gemm(a,
w_q,
w_s,
w_zp_empty,
g_idx,
sort_indices,
workspace.scratch,
wtype,
size_m=a.shape[0],
size_n=w_ref.shape[1],
size_k=w_ref.shape[0],
is_k_full=True))))
# machete
timers.append(
bench_fn(
label, sub_label, "machete_heuristic", lambda: loop_over_weights(
a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
if sweep_schedules:
print("Finding best schedule for machete")
best = None
best_schedule = None
schedules = ops.machete_supported_schedules(wtype)
for schedule in reversed(schedules):
schedule_M = int(schedule.split("_")[0].split("x")[1])
# Prune known bad schedules
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
continue
def run(a, _, w_q, w_s, schedule=schedule):
ops.machete_gemm(a,
w_q,
wtype,
w_s,
b_group_size=group_size,
schedule=schedule)
res = bench_fn(label, sub_label, "machete_best",
lambda: loop_over_weights(a, weights_machete, run))
results_row = {
"M": m,
"K": k,
"N": n,
"group_size": group_size,
"schedule": schedule,
"median": res.median,
}
if _SWEEP_SCHEDULES_RESULTS is None:
_SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
columns=results_row.keys())
_SWEEP_SCHEDULES_RESULTS.\
loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
print(f" {res.median:5.5} ", schedule)
if not best or res.median < best.median:
best = res
best_schedule = schedule
print("Best schedule:", best_schedule)
timers.append(best)
return timers
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype, sweep_schedules: bool,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype,
scalar_types.uint4b8,
128,
m,
k,
n,
f"{dtype}-gemm",
f"MKN=({m}x{k}x{n})",
sweep_schedules=sweep_schedules)
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(
data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
):
print(f"== All Results {base_description} ====")
print_timers(data)
# pickle all the results
timestamp = int(time.time()) if timestamp is None else timestamp
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
m_increment, k_increment, n_increment = \
[int(x) for x in args.dim_increment.split(",")]
Ms = list(range(m_start, m_end + 1, m_increment))
Ks = list(range(k_start, k_end + 1, k_increment))
Ns = list(range(n_start, n_end + 1, n_increment))
MKNs = list(product(Ms, Ks, Ns))
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KNs.append(KN)
return KNs
model_bench_data = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
Ms = args.batch_sizes
KNs = model_shapes(model, tp_size)
MKNs = []
for m in Ms:
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, args.sweep_schedules, MKNs)
model_bench_data.append(data)
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
all_data = []
for d in model_bench_data:
all_data.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
if __name__ == "__main__":
def to_torch_dtype(dt):
if dt == "bfloat16":
return torch.bfloat16
if dt == "float16":
return torch.float16
raise ValueError("unsupported dtype")
parser = FlexibleArgumentParser(
description="""
Benchmark Machete GEMM.
To run square GEMMs:
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128,16384,16384 --dim-end 512,16384,16384 --dim-increment 64,1,1
To run dimensions from a model:
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
"--dtype",
type=to_torch_dtype,
required=True,
help="Available options are ['bfloat16', 'float16']",
)
parser.add_argument(
"--sweep-schedules",
action="store_true",
help="Run a sweep over all supported schedules",
)
parser.add_argument("--sweep-csv-out",
help="CSV to store sweep results",
default="sch_sweep_results.csv")
subparsers = parser.add_subparsers(dest="cmd", required=True)
square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
square_parser.add_argument("--dim-increment", type=int, required=True)
square_parser.set_defaults(func=run_square_bench)
range_parser = subparsers.add_parser("range_bench")
range_parser.add_argument(
"--dim-start",
type=str,
required=True,
help="Start value for M,K,N as common separated list")
range_parser.add_argument(
"--dim-end",
type=str,
required=True,
help="End value (inclusive) for M,K,N as common separated list")
range_parser.add_argument(
"--dim-increment",
type=str,
required=True,
help="Increment value for M,K,N as common separated list")
range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys(),
)
model_parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
model_parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
_SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
args.func(args)
if _SWEEP_SCHEDULES_RESULTS is not None:
_SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
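For reference, the .pkl emitted above can be inspected with a few lines of Python. This is a minimal sketch; the filename is hypothetical, following the model_bench-{dtype}-{timestamp}.pkl pattern used above:

import pickle

with open("model_bench-torch.float16-1700000000.pkl", "rb") as f:  # hypothetical name
    measurements = pickle.load(f)

# Each entry is a torch.utils.benchmark Measurement; print a quick summary.
for m in measurements:
    print(m.task_spec.sub_label, m.task_spec.description,
          f"{m.median * 1e6:.1f} us")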

View File

@@ -7,17 +7,16 @@ from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, gptq_quantize_weights, sort_weights)
from vllm.scalar_type import ScalarType
gptq_pack, quantize_weights, sort_weights)
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@@ -28,14 +27,13 @@ K_FULL_OPTS = [False, True]
def bench_run(results: List[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, quant_type: ScalarType,
group_size: int, size_m: int, size_k: int, size_n: int):
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
size_m: int, size_k: int, size_n: int):
label = "Quant Matmul"
sub_label = ("{}, act={} k_full={}, q={}, g={}, "
"MKN=({}x{}x{})".format(model, act_order, is_k_full,
str(quant_type), group_size, size_m,
size_k, size_n))
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
"MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
group_size, size_m, size_k, size_n))
print(f"Testing: {sub_label}")
@@ -52,18 +50,16 @@ def bench_run(results: List[benchmark.Measurement], model: str,
marlin_g_idx,
marlin_sort_indices,
marlin_rand_perm,
) = marlin_quantize(b, quant_type, group_size, act_order)
) = marlin_quantize(b, num_bits, group_size, act_order)
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
marlin_24_s) = marlin_24_quantize(b, quant_type, group_size)
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
# GPTQ quant
(w_ref, q_w, s, g_idx,
rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
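The sort mentioned in this comment is done by sort_weights; as a hedged sketch of the idea (hypothetical shapes, assuming g_idx maps each row of q_w to its quantization group):

import torch

size_k, size_n, group_size = 128, 64, 32  # illustration only
q_w = torch.randint(0, 16, (size_k, size_n))
g_idx = torch.randint(0, size_k // group_size, (size_k,))

perm = torch.argsort(g_idx)  # permutation that makes group ids increasing
q_w_sorted = q_w[perm, :]
g_idx_sorted = g_idx[perm]
assert bool((g_idx_sorted[1:] >= g_idx_sorted[:-1]).all())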
@@ -77,11 +73,10 @@ def bench_run(results: List[benchmark.Measurement], model: str,
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_MAX_PARALLEL)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
globals = {
# Gen params
"quant_type": quant_type,
"num_bits": num_bits,
"group_size": group_size,
"size_m": size_m,
"size_n": size_n,
@@ -92,7 +87,6 @@ def bench_run(results: List[benchmark.Measurement], model: str,
"marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w,
"marlin_s": marlin_s,
"marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
@@ -131,29 +125,19 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp16",
description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time))
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time))
if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -163,7 +147,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -199,13 +183,12 @@ def main(args):
) > 0 and is_k_full not in args.limit_k_full:
continue
for quant_type in query_marlin_supported_quant_types(
False):
if len(args.limit_num_bits) > 0 and \
quant_type.size_bits not in args.limit_num_bits:
for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
if len(args.limit_num_bits
) > 0 and num_bits not in args.limit_num_bits:
continue
for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
if len(
args.limit_group_size
) > 0 and group_size not in args.limit_group_size:
@@ -219,8 +202,8 @@ def main(args):
for size_m in args.batch_sizes:
bench_run(results, model, act_order, is_k_full,
quant_type, group_size, size_m,
size_k, size_n)
num_bits, group_size, size_m, size_k,
size_n)
compare = benchmark.Compare(results)
compare.print()

View File

@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.utils import FlexibleArgumentParser, seed_everything
from vllm.utils import FlexibleArgumentParser
class BenchmarkConfig(TypedDict):
@@ -30,36 +30,19 @@ def benchmark_config(
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_fp8: bool,
num_iters: int = 100,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
init_dtype = torch.float16 if use_fp8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16:
w1 = torch.randint(-127,
127, (
num_experts,
shard_intermediate_size,
hidden_size,
),
dtype=torch.int8)
w2 = torch.randint(-127,
127, (
num_experts,
hidden_size,
shard_intermediate_size // 2,
),
dtype=torch.int8)
else:
w1 = torch.randn(num_experts,
shard_intermediate_size,
hidden_size,
dtype=init_dtype)
w2 = torch.randn(num_experts,
hidden_size,
shard_intermediate_size // 2,
dtype=init_dtype)
w1 = torch.randn(num_experts,
shard_intermediate_size,
hidden_size,
dtype=init_dtype)
w2 = torch.randn(num_experts,
hidden_size,
shard_intermediate_size // 2,
dtype=init_dtype)
gating_output = torch.randn(num_iters,
num_tokens,
num_experts,
@@ -69,11 +52,7 @@ def benchmark_config(
w2_scale = None
a1_scale = None
a2_scale = None
if use_int8_w8a16:
w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
dtype=torch.float32)
w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
if use_fp8_w8a8:
if use_fp8:
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
a1_scale = torch.randn(1, dtype=torch.float32)
@@ -97,8 +76,7 @@ def benchmark_config(
renormalize=True,
inplace=True,
override_config=config,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
use_fp8=use_fp8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
@@ -166,7 +144,7 @@ class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
seed_everything(seed)
torch.cuda.manual_seed_all(seed)
self.seed = seed
def benchmark(
@@ -177,13 +155,11 @@ class BenchmarkWorker:
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_fp8: bool,
) -> Tuple[Dict[str, int], float]:
seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
torch.cuda.manual_seed_all(self.seed)
dtype_str = "float8" if use_fp8 else None
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
@@ -197,8 +173,7 @@ class BenchmarkWorker:
key=lambda x: abs(x - num_tokens))]
kernel_time = benchmark_config(config, num_tokens, num_experts,
shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8,
use_int8_w8a16)
topk, dtype, use_fp8)
return config, kernel_time
def tune(
@@ -209,10 +184,9 @@ class BenchmarkWorker:
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
search_space: List[Dict[str, int]],
) -> Dict[str, int]:
use_fp8: bool,
search_space: List[BenchmarkConfig],
) -> BenchmarkConfig:
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
@@ -224,8 +198,7 @@ class BenchmarkWorker:
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_fp8,
num_iters=10)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
@@ -251,19 +224,20 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
}
def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
shard_intermediate_size: int, hidden_size: int, topk: int,
dtype: torch.dtype, use_fp8_w8a8: bool,
use_int8_w8a16: bool) -> None:
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
def save_configs(
configs: Dict[int, BenchmarkConfig],
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8: bool,
) -> None:
dtype_str = "float8" if use_fp8 else None
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
dtype_str)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
json.dump(configs, f, indent=4)
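The NOTE above about w2.shape[2] follows from silu_and_mul consuming a 2*d-wide projection and emitting d values; a minimal reference sketch of the semantics (not vLLM's fused kernel):

import torch
import torch.nn.functional as F

def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    # x: [..., 2 * d] -> [..., d]; this halving is why the configs are
    # keyed on shard_intermediate_size // 2.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

print(silu_and_mul(torch.randn(4, 16)).shape)  # torch.Size([4, 8])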
@@ -279,11 +253,6 @@ def main(args: argparse.Namespace):
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Default: Mixtral.
E = config.num_local_experts
@@ -293,8 +262,7 @@ def main(args: argparse.Namespace):
hidden_size = config.hidden_size
dtype = config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_fp8 = args.dtype == "fp8"
if args.batch_size is None:
batch_sizes = [
@@ -326,21 +294,21 @@ def main(args: argparse.Namespace):
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
topk, dtype, use_fp8, search_space)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
}
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
topk, dtype, use_fp8)
end = time.time()
print(f"Tuning took {end - start:.2f} seconds")
else:
outputs = _distribute(
"benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
for batch_size in batch_sizes])
outputs = _distribute("benchmark",
[(batch_size, E, shard_intermediate_size,
hidden_size, topk, dtype, use_fp8)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
print(f"Batch size: {batch_size}, config: {config}")
@@ -355,7 +323,7 @@ if __name__ == "__main__":
parser.add_argument("--tp-size", "-tp", type=int, default=2)
parser.add_argument("--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"],
choices=["auto", "fp8"],
default="auto")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)

View File

@@ -6,7 +6,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random, seed_everything)
create_kv_caches_with_random)
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -28,7 +28,10 @@ def main(
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
seed_everything(seed)
random.seed(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
@@ -172,7 +175,7 @@ if __name__ == '__main__':
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")

View File

@@ -1,100 +0,0 @@
import time
import torch
from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
seed_everything)
@torch.inference_mode()
def main(num_tokens: int,
hidden_size: int,
static_scale: bool,
quant_dtype: torch.dtype,
dtype: torch.dtype,
seed: int = 0,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
seed_everything(seed)
torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
if quant_dtype == torch.int8:
ops.scaled_int8_quant(x, scale)
else:
ops.scaled_fp8_quant(x, scale)
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:
            torch.cuda.cudart().cudaProfilerStop()  # stop the profiler range opened above
return (end_time - start_time) / num_iters
# Warmup.
print("Warming up...")
run_benchmark = run_cuda_benchmark
run_benchmark(num_iters=num_warmup_iters, profile=False)
# Benchmark.
if do_profile:
latency = run_benchmark(num_iters=1, profile=True)
else:
latency = run_benchmark(num_iters=num_iters, profile=False)
print(f"Kernel running time: {latency * 1000000:.3f} us")
if __name__ == '__main__':
def to_torch_dtype(dt):
if dt == "int8":
return torch.int8
if dt == "fp8":
return torch.float8_e4m3fn
raise ValueError(f"Unsupported dtype: {dt}")
parser = FlexibleArgumentParser(
description="Benchmark the quantization (fp8 or int8) kernel.")
parser.add_argument("--num-tokens", type=int, default=4096)
parser.add_argument("--hidden-size", type=int, default=8192)
parser.add_argument("--static-scale", action="store_true")
parser.add_argument("--quant-dtype",
type=str,
choices=["fp8", "int8"],
default="int8")
parser.add_argument("--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="half")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
parser.add_argument("--num-warmup-iters", type=int, default=5)
parser.add_argument("--num-iters",
type=int,
default=100,
help="Number of benchmark iterations. "
"If --profile is set, this number is ignored")
args = parser.parse_args()
print(args)
main(num_tokens=args.num_tokens,
hidden_size=args.hidden_size,
static_scale=args.static_scale,
quant_dtype=to_torch_dtype(args.quant_dtype),
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
seed=args.seed,
do_profile=args.profile,
num_warmup_iters=args.num_warmup_iters,
num_iters=args.num_iters)
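For intuition about what the benchmark loop above times, here is a toy reference for static-scale int8 quantization. It sketches the semantics only; ops.scaled_int8_quant is a fused CUDA kernel whose exact return signature is not shown in this diff:

import torch

def ref_scaled_int8_quant(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Divide by the scale, round, and clamp to the int8 range.
    return torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)

x = torch.randn(16, 32)
scale = torch.tensor([[0.02]], dtype=torch.float32)
print(ref_scaled_int8_quant(x, scale).dtype)  # torch.int8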

View File

@@ -6,7 +6,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
from vllm.utils import FlexibleArgumentParser, seed_everything
from vllm.utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(
@@ -22,7 +22,9 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192,
base: int = 10000,
) -> None:
seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
@@ -31,7 +33,7 @@ def benchmark_rope_kernels_multi_lora(
# batched RoPE can take multiple scaling factors
batched_rope = get_rope(head_size, rotary_dim, max_position, base,
is_neox_style, {
"rope_type": "linear",
"type": "linear",
"factor": tuple(scaling_factors)
})
# non-batched RoPE takes only one scaling factor, we create multiple
@@ -41,7 +43,7 @@ def benchmark_rope_kernels_multi_lora(
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
{
"rope_type": "linear",
"type": "linear",
"factor": (scaling_factor, )
}))
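For context, "linear" scaling here is position interpolation: position ids are compressed by the scaling factor before the rotary angles are computed. A toy sketch of the idea, not vLLM's RotaryEmbedding:

import torch

def linearly_scaled_positions(positions: torch.Tensor,
                              factor: float) -> torch.Tensor:
    # Compressing positions by 1/factor lets a model trained on
    # max_position tokens cover factor * max_position tokens.
    return positions / factor

print(linearly_scaled_positions(torch.arange(8), 4.0))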
@@ -92,7 +94,7 @@ if __name__ == '__main__':
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",

View File

@@ -1,62 +0,0 @@
import math
import pickle
import re
from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.utils import FlexibleArgumentParser
if __name__ == "__main__":
parser = FlexibleArgumentParser(
        description='Plot Machete benchmark results from a pickle file of '
        'torch.utils.benchmark Measurements.')
parser.add_argument('filename', type=str)
args = parser.parse_args()
with open(args.filename, 'rb') as f:
data: List[TMeasurement] = pickle.load(f)
results = defaultdict(lambda: list())
for v in data:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
else:
raise Exception("MKN not found")
result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
if result is not None:
M = result.group(1)
else:
raise Exception("MKN not found")
kernel = v.task_spec.description
results[KN].append({
"kernel": kernel,
"batch_size": M,
"median": v.median
})
rows = int(math.ceil(len(results) / 2))
fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
axs = axs.flatten()
for axs_idx, (shape, data) in enumerate(results.items()):
plt.sca(axs[axs_idx])
df = pd.DataFrame(data)
sns.lineplot(data=df,
x="batch_size",
y="median",
hue="kernel",
style="kernel",
markers=True,
dashes=False,
palette="Dark2")
plt.title(f"Shape: {shape}")
plt.ylabel("time (median, s)")
plt.tight_layout()
plt.savefig("graph_machete_bench.pdf")
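The script is run on the pickle produced by the Machete benchmark, e.g. (hypothetical filename):

python3 ./benchmarks/kernels/graph_machete_bench.py model_bench-torch.float16-1700000000.pkl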

View File

@@ -1 +0,0 @@
pandas

View File

@@ -1,43 +0,0 @@
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
}
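As a sketch of how the TP split documented above is applied (mirroring model_shapes in the Machete benchmark earlier in this diff):

def tp_split(shapes, tp_size):
    # Divide the TP_SPLIT_DIM entry of each [K, N] shape by the TP degree.
    out = []
    for kn, tp_split_dim in shapes:
        kn = list(kn)
        kn[tp_split_dim] //= tp_size
        out.append(tuple(kn))
    return out

print(tp_split(WEIGHT_SHAPES["meta-llama/Llama-2-7b-hf"], 2))
# [(4096, 6144), (2048, 4096), (4096, 11008), (5504, 4096)]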

View File

@@ -6,7 +6,7 @@ TOKENS=$2
docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
ghcr.io/huggingface/text-generation-inference:1.4.0 \
--model-id $MODEL \
--sharded false \
--max-input-length 1024 \

View File

@@ -16,6 +16,7 @@ def main(args):
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
use_v2_block_manager=args.use_v2_block_manager,
)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -55,5 +56,8 @@ if __name__ == "__main__":
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
                        help='Use BlockSpaceManagerV2')
args = parser.parse_args()
main(args)

View File

@@ -1,7 +1,3 @@
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
#
@@ -85,39 +81,14 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif()
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
#
if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.5.3
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")
set(ONEDNN_BUILD_TESTS "OFF")
set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ONEDNN_BUILD_GRAPH "OFF")
set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
FetchContent_MakeAvailable(oneDNN)
list(APPEND LIBS dnnl)
endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS numa)
list(APPEND LIBS "numa")
#
# Define extension targets
#
#
# _C extension
@@ -131,16 +102,6 @@ set(VLLM_EXT_SRC
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
"csrc/cpu/quant.cpp"
${VLLM_EXT_SRC})
endif()
#
# Define extension targets
#
define_gpu_extension_target(
_C
DESTINATION vllm
@@ -152,4 +113,6 @@ define_gpu_extension_target(
WITH_SOABI
)
add_custom_target(default)
message(STATUS "Enabling C extension.")
add_dependencies(default _C)

Some files were not shown because too many files have changed in this diff.