Compare commits

..

2 Commits

Author SHA1 Message Date
1936d7bab0 format 2024-06-02 00:02:54 +00:00
996cf2de5c Fix hashing logic for non-full blocks 2024-06-02 00:01:30 +00:00
776 changed files with 19312 additions and 81295 deletions

View File

@ -1,7 +1,7 @@
import os
import zipfile
MAX_SIZE_MB = 250
MAX_SIZE_MB = 200
def print_top_10_largest_files(zip_file):

View File

@ -0,0 +1,18 @@
#!/bin/bash
set -ex
set -o pipefail
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
cd -

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.671
- name: "exact_match,flexible-extract"
value: 0.664
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.905
- name: "exact_match,flexible-extract"
value: 0.905
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.892
- name: "exact_match,flexible-extract"
value: 0.892
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.752
- name: "exact_match,flexible-extract"
value: 0.754
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.755
- name: "exact_match,flexible-extract"
value: 0.755
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.728
- name: "exact_match,flexible-extract"
value: 0.728
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.756
- name: "exact_match,flexible-extract"
value: 0.752
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.409
- name: "exact_match,flexible-extract"
value: 0.406
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.252
- name: "exact_match,flexible-extract"
value: 0.252
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.86
- name: "exact_match,flexible-extract"
value: 0.86
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.624
- name: "exact_match,flexible-extract"
value: 0.624
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.616
- name: "exact_match,flexible-extract"
value: 0.632
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.578
- name: "exact_match,flexible-extract"
value: 0.585
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.593
- name: "exact_match,flexible-extract"
value: 0.588
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.595
- name: "exact_match,flexible-extract"
value: 0.582
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.792
- name: "exact_match,flexible-extract"
value: 0.824
limit: 250
num_fewshot: 5

View File

@ -1,5 +0,0 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml

View File

@ -1,10 +0,0 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -1,46 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
usage() {
echo
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo
}
while getopts "m:b:l:f:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model hf \
--model_args pretrained=$MODEL,parallelize=True \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -1,51 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3
usage() {
echo
echo "Runs lm eval harness on GSM8k using vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -1,59 +0,0 @@
#!/bin/bash
usage() {
echo
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi

View File

@ -1,55 +0,0 @@
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""
import os
from pathlib import Path
import lm_eval
import numpy
import yaml
RTOL = 0.02
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true"
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto")
return results
def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)

View File

@ -1,152 +0,0 @@
# vLLM benchmark suite
## Introduction
This directory contains two sets of benchmarks for vLLM.
- Performance benchmark: benchmarks vLLM's performance under various workloads, so that **developers** can see whether their PR improves or degrades vLLM's performance.
- Nightly benchmark: compares vLLM's performance against alternatives (TGI, TRT-LLM and lmdeploy), so that **the public** knows when to choose vLLM.
See the [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for FP8 benchmarks on H100 is coming!), with different models.
**Benchmarking Duration**: about 1hr.
**For benchmarking developers**: please try to keep the benchmarking duration to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: fixed-QPS serving on A100 (support for FP8 benchmarks on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
The performance benchmark is triggered when:
- A PR is merged into vLLM.
- A commit is pushed to a PR with the `perf-benchmarks` label.
The nightly benchmark is triggered when:
- A commit is pushed to a PR with the `nightly-benchmarks` label.
## Performance benchmark details
See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
Here is an example of one test inside `latency-tests.json`:
```json
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
]
```
In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command line arguments used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` converts the underscores back to dashes when feeding the arguments to `benchmark_latency.py` (see the sketch below). For this example, the resulting command line arguments for `benchmark_latency.py` are `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.
Note that the performance numbers are highly sensitive to the parameter values, so please make sure they are set correctly.
WARNING: The benchmarking script saves json results by itself, so please do not set the `--output-json` parameter in the json file.
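A minimal sketch of the underscore-to-dash conversion, written in Python for illustration (the real pipeline does this with a `jq`-based shell helper inside `run-benchmarks-suite.sh`; the function name below is hypothetical):
```python
# Illustrative sketch: turn a "parameters" dict from latency-tests.json into
# the command line arguments for benchmark_latency.py, replacing '_' with '-'
# in every key, as run-benchmarks-suite.sh does.
def params_to_cli_args(parameters: dict) -> str:
    parts = []
    for key, value in parameters.items():
        parts.append("--" + key.replace("_", "-"))
        parts.append(str(value))
    return " ".join(parts)

params = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print("python3 benchmark_latency.py " + params_to_cli_args(params))
# python3 benchmark_latency.py --model meta-llama/Meta-Llama-3-8B \
#   --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```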
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed to `benchmark_throughput.py`.
The reported number for this test is also fairly stable across runs, so even a slight change in it can indicate a meaningful change in performance.
#### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
]
```
Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` attribute contains the command line arguments for the vLLM server.
- The `client_parameters` attribute contains the command line arguments for `benchmark_serving.py`.
- The `qps_list` attribute controls the list of QPS values to test. Each value is used to set the `--request-rate` parameter of `benchmark_serving.py` (see the sketch below).
The numbers from this test are less stable than those from the latency and throughput benchmarks (due to the randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in them (e.g. a 5% change) still indicates a real difference.
WARNING: The benchmarking script saves json results by itself, so please do not set `--save-result` or other result-saving parameters in `serving-tests.json`.
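As a rough sketch of how `qps_list` fans out into client invocations (written in Python for illustration; the real logic lives in the bash loop of `run-benchmarks-suite.sh`, and the helper below is hypothetical):
```python
# Illustrative sketch: each entry in qps_list yields one benchmark_serving.py
# invocation whose --request-rate and result filename encode that QPS value.
def client_commands(test: dict, results_folder: str = "results/") -> list:
    flags = " ".join(
        "--" + key.replace("_", "-") + " " + str(value)
        for key, value in test["client_parameters"].items()
    )
    return [
        f"python3 benchmark_serving.py --save-result"
        f" --result-dir {results_folder}"
        f" --result-filename {test['test_name']}_qps_{qps}.json"
        f" --request-rate {qps} {flags}"
        for qps in test["qps_list"]
    ]

test = {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "client_parameters": {
        "model": "meta-llama/Meta-Llama-3-8B",
        "backend": "vllm",
        "dataset_name": "sharegpt",
        "num_prompts": 200,
    },
}
for command in client_commands(test):
    print(command)
```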
#### Visualizing the results
The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
You can find the resulting table on the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark results) is also attached to the markdown file.
The raw benchmarking results (as json files) are available in the `Artifacts` tab of the benchmarking job.
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workloads, the models, and the docker containers used to benchmark the other LLM engines.
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which probes which serving engine the current container provides.
- `run-nightly-suite.sh` then dispatches to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and runs the benchmark.
- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload them to Buildkite.
#### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark; see the sketch below for a plausible entry.
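Based on the fields that `run-lmdeploy-nightly.sh` (shown later in this diff) reads from `nightly-tests.json`, a single entry plausibly looks like the Python dict below. This is a hedged reconstruction for illustration, not the verbatim contents of the file, and the concrete values are made up:
```python
# Hypothetical shape of one nightly test entry, reconstructed from the keys
# the nightly runner scripts read; all values here are illustrative only.
nightly_test_entry = {
    "test_name": "llama8B_tp1_sharegpt",
    "qps_list": [4, "inf"],
    "common_parameters": {
        "model": "meta-llama/Meta-Llama-3-8B",
        "tp": 1,
        "dataset_name": "sharegpt",
        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
        "port": 8000,
        "num_prompts": 500,
    },
    # Per-engine server/client arguments, e.g. for lmdeploy:
    "lmdeploy_server_parameters": {},
    "lmdeploy_client_parameters": {},
}
```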
#### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -1,61 +0,0 @@
steps:
- label: "Wait for container to be ready"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
- image: badouralix/curl-jq
command:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN

View File

@ -1,45 +0,0 @@
# Nightly benchmark
The main goal of this benchmarking is twofold:
- Performance clarity: provide clarity on which engine (vllm, tensorrt-llm, lmdeploy or tgi) leads in performance on which workload.
- Reproducibility: anyone can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions in [reproduce.md]().
## Docker images
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
## Hardware
One AWS node with 8x NVIDIA A100 GPUs.
## Workload description
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
- Input length: 500 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
- Output length: the corresponding output lengths of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is drawn from a Poisson process (with a fixed random seed); see the sketch below.
- Evaluation metrics: throughput (higher is better), TTFT (time to first token, lower is better), ITL (inter-token latency, lower is better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
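A minimal sketch of how such Poisson-process arrival times can be generated for a given average QPS (the function name and exact implementation here are illustrative, not taken from the benchmarking scripts):
```python
# Illustrative sketch: generate arrival times for a Poisson process with an
# average rate of `qps` queries per second, using a fixed random seed.
import numpy as np

def poisson_arrival_times(num_queries: int, qps: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    # Inter-arrival gaps of a Poisson process are exponentially distributed
    # with mean 1 / qps; their cumulative sum gives the absolute arrival times.
    gaps = rng.exponential(scale=1.0 / qps, size=num_queries)
    return np.cumsum(gaps)

arrivals = poisson_arrival_times(num_queries=500, qps=4.0)
print(arrivals[:5])          # first few arrival times (seconds)
print(arrivals[-1] / 60.0)   # total duration in minutes (~2 min for 500 queries at 4 QPS)
```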
## Plots
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. A value of 0 means that the corresponding benchmark crashed.
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}

View File

@ -1,120 +0,0 @@
common_pod_spec: &common_pod_spec
priorityClassName: perf-benchmark
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /root/.cache/huggingface
type: Directory
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.5.0
<<: *common_container_settings
- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:latest
<<: *common_container_settings
- label: "A100 tgi benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1
<<: *common_container_settings
- wait
- label: "Plot"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.5.0.post1
command:
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- wait

View File

@ -1,380 +0,0 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
ensure_sharegpt_downloaded() {
local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
if [ ! -f "$FILE" ]; then
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
else
echo "$FILE already exists."
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
}
kill_gpu_processes() {
# kill all processes on GPU.
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
if [ -z "$pids" ]; then
echo "No GPU processes found."
else
for pid in $pids; do
kill -9 "$pid"
echo "Killed process with PID: $pid"
done
echo "All GPU processes have been killed."
fi
# waiting for GPU processes to be fully killed
# loop while nvidia-smi returns any processes
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
sleep 1
echo "Waiting for GPU processes to be killed"
done
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
if command -v buildkite-agent >/dev/null 2>&1; then
BUILDKITE_AGENT_COMMAND="buildkite-agent"
elif [ -f /workspace/buildkite-agent ]; then
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
else
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# $1: a json file specifying latency test cases
local latency_test_file
latency_test_file=$1
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
latency_command="python3 benchmark_latency.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
echo "Running test case $test_name"
echo "Latency command: $latency_command"
# record the benchmarking command and the GPU type
jq_output=$(jq -n \
--arg latency "$latency_command" \
--arg gpu "$gpu_type" \
'{
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$latency_command"
kill_gpu_processes
done
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
throughput_command="python3 benchmark_throughput.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# record the benchmarking command and the GPU type
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
echo "In serving-test.json, test_name must start with \"serving_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
# check if the server model and the client model are aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $testname."
continue
fi
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
server_pid=$!
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
check_gpus
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOG_LEVEL="WARNING"
# prepare for benchmarking
cd benchmarks || exit 1
ensure_sharegpt_downloaded
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
upload_to_buildkite
}
main "$@"

View File

@ -1,76 +0,0 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@ -1,192 +0,0 @@
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"avg_latency": "Mean latency (ms)",
# "P10": "P10 (s)",
# "P25": "P25 (s)",
"P50": "Median latency (ms)",
# "P75": "P75 (s)",
# "P90": "P90 (s)",
"P99": "P99 latency (ms)",
}
# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "num_requests": "# of req.",
# "total_num_tokens": "Total # of tokens",
# "elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)",
# "tokens_per_second": "Tput (tok/s)",
}
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "completed": "# of req.",
"request_throughput": "Tput (req/s)",
# "input_throughput": "Input Tput (tok/s)",
# "output_throughput": "Output Tput (tok/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
# "mean_tpot_ms": "Mean TPOT (ms)",
# "median_tpot_ms": "Median",
# "p99_tpot_ms": "P99",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
}
def read_markdown(file):
if os.path.exists(file):
with open(file, "r") as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
def results_to_json(latency, throughput, serving):
return json.dumps({
'latency': latency.to_dict(),
'throughput': throughput.to_dict(),
'serving': serving.to_dict()
})
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# get different percentiles
for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms
raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result
latency_results.append(raw_result)
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
throughput_results.append(raw_result)
continue
print(f"Skipping {test_file}")
latency_results = pd.DataFrame.from_dict(latency_results)
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(latency_results, throughput_results,
serving_results)
# remapping the key, for visualization purpose
if not latency_results.empty:
latency_results = latency_results[list(
latency_column_mapping.keys())].rename(
columns=latency_column_mapping)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
if not throughput_results.empty:
throughput_results = throughput_results[list(
throughput_results_column_mapping.keys())].rename(
columns=throughput_results_column_mapping)
processed_results_json = results_to_json(latency_results,
throughput_results,
serving_results)
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
tablefmt='pipe',
showindex=False)
serving_md_table = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/tests/descriptions.md")
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json)
f.write(results)
# document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f:
results = latency_results.to_dict(
orient='records') + throughput_results.to_dict(
orient='records') + serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -1,26 +0,0 @@
import argparse
from transformers import AutoTokenizer
def main(model, cachedir):
# Load the tokenizer and save it to the specified directory
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.save_pretrained(cachedir)
print(f"Tokenizer saved to {cachedir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer")
parser.add_argument("--model",
type=str,
required=True,
help="Name of the model")
parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@ -1,6 +0,0 @@
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]
print(model_name)

View File

@ -1,102 +0,0 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameters inside tensorrt_demo are consistent with the env vars
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &

View File

@ -1,40 +0,0 @@
#!/bin/bash
set -ex
set -o pipefail
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
exit 0
fi
# initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
ls results/
# generate figures
python3 -m pip install tabulate pandas matplotlib
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
--description $description \
--results-folder results/
# upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@ -1,135 +0,0 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@ -1,218 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for the lmdeploy server to start
# return 1 if the server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and return 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,216 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args; '_' in keys is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name; the engine name is prepended to it below
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# strip the single quotes that jq's @sh adds around the string "inf"
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and return 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,214 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args; '_' in keys is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name; the engine name is prepended to it below
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# strip the single quotes that jq's @sh adds around the string "inf"
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and return 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,221 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args; '_' in keys is replaced with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name; the engine name is prepended to it below
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# strip the single quotes that jq's @sh adds around the string "inf"
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results and return 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,76 +0,0 @@
import datetime
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "input_throughput": "Input Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "engine": "Engine",
}
if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())
        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)
        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})
        # add the result to serving_results
        serving_results.append(raw_result)
    serving_results = pd.DataFrame.from_dict(serving_results)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    # drop the header row and the separator line
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with the header,
        # for those who want to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write('\n')
    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))

View File

@ -1,17 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi
echo "Waiting for image to be available..."
retries=$((retries + 1))
sleep 5
done
exit 1

View File

@ -1,67 +0,0 @@
## Latency tests
This test suite aims to test vllm's end-to-end latency under a controlled setup.
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99); a sketch of how these statistics are computed follows this list.
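For reference, the reported statistics can be recomputed from the raw per-iteration latencies; the sketch below assumes `latencies` holds the end-to-end latency of each iteration in seconds (the variable name and values are illustrative, not taken from the benchmark output):
```python
import numpy as np

# illustrative per-iteration end-to-end latencies, in seconds
latencies = [1.92, 2.01, 1.88, 2.35, 1.95]

print("mean  :", np.mean(latencies))
print("median:", np.median(latencies))
print("p99   :", np.percentile(latencies, 99))
```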
### Latency benchmarking results
{latency_tests_markdown_table}
## Throughput tests
This test suite aims to test vllm's throughput.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
### Throughput benchmarking results
{throughput_tests_markdown_table}
## Serving tests
This test suite aims to test vllm's real serving metrics.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with a fixed random seed); a sketch of this sampling follows this list.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
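As a minimal sketch of how such Poisson arrivals can be generated (the function below is illustrative and not necessarily how `benchmark_serving.py` implements it), the inter-arrival gaps of a Poisson process are exponentially distributed with mean 1/QPS:
```python
import numpy as np

def request_arrival_times(num_prompts: int, qps: float, seed: int = 0) -> np.ndarray:
    """Arrival times (in seconds) of a Poisson process with the given average QPS."""
    rng = np.random.default_rng(seed)  # fixed seed for reproducibility
    if qps == float("inf"):
        return np.zeros(num_prompts)  # all requests arrive at once
    # inter-arrival gaps of a Poisson process are exponential with mean 1/qps
    gaps = rng.exponential(scale=1.0 / qps, size=num_prompts)
    return np.cumsum(gaps)

print(request_arrival_times(5, qps=4.0))
```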
### Serving benchmarking results
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
import json
import pandas as pd
benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.

View File

@ -1,32 +0,0 @@
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
},
{
"test_name": "latency_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
}
]

View File

@ -1,116 +0,0 @@
[
{
"test_name": "llama8B_tp1",
"qps_list": [4],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "llama70B_tp4",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "mixtral8x7B_tp2",
"qps_list": [2],
"common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tp": 2,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
}
]

View File

@ -1,80 +0,0 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -1,35 +0,0 @@
[
{
"test_name": "throughput_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -1,19 +0,0 @@
steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"

View File

@ -2,15 +2,6 @@
set -ex
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
@ -54,10 +45,15 @@ while true; do
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
image_name=rocm_${sha}
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \
-t ${image_name} \
-f Dockerfile.rocm \
--progress plain \
.
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
@ -66,18 +62,11 @@ trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${@}"

View File

@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /usr/bin/buildkite-agent ]; then
if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
fi
rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"
/workspace/buildkite-agent artifact upload "*.json"

View File

@ -3,38 +3,12 @@
set -ex
# Try building the docker image
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
docker build -t cpu-test -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest Pillow protobuf
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU
# online inference
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py

View File

@ -1,105 +0,0 @@
#!/bin/bash
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi
WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# start the container in detached mode
# things to note:
# 1. --shm-size=10.24gb is required. don't use --ipc=host
# 2. pass HF_TOKEN to the container
# 3. map the huggingface cache directory to the container
# 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
if [ $node -eq 0 ]; then
# start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
# wait for the cluster to be ready
sleep 10
# print the cluster status
docker exec node0 /bin/bash -c "ray status"
}
run_nodes() {
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes

View File

@ -1,14 +0,0 @@
# This script builds the OpenVINO docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .
# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py

View File

@ -1,16 +0,0 @@
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py

View File

@ -1,14 +0,0 @@
# This script builds the XPU docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py

View File

@ -1,37 +1,11 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# This script will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.
steps:
- label: Async Engine, Inputs, Utils, Worker Test
fast_check: true
fast_check_only: true
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Metrics, Tracing Test
fast_check: true
fast_check_only: true
commands:
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai" # Tracing
- pytest -v -s tracing
- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
@ -41,120 +15,86 @@ steps:
- label: Basic Correctness Test
mirror_hardwares: [amd]
fast_check: true
commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
mirror_hardwares: [amd]
fast_check: true
commands:
- pytest -v -s core
command: pytest -v -s core
- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
command: pytest -v -s distributed/test_comm_ops.py
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total)
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs)
- label: Distributed Tests
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- label: Distributed Tests (4 GPUs)
- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
commands:
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s distributed/test_pipeline_parallel.py
- label: Engine Test
mirror_hardwares: [amd]
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
fast_check: true
mirror_hardwares: [amd]
commands:
- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai
- pytest -v -s test_inputs.py
- pytest -v -s entrypoints -m llm
- pytest -v -s entrypoints -m openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 offline_inference_vision_language.py
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Inputs Test
- label: Kernels Test %N
#mirror_hardwares: [amd]
commands:
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
# - label: Kernels Test %N
# #mirror_hardwares: [amd]
# commands:
# - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
# - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
# parallelism: 4
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py
- label: Vision Language Models Test
- label: Llava Test
mirror_hardwares: [amd]
commands:
- pytest -v -s models -m vlm
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- label: Prefix Caching Test
mirror_hardwares: [amd]
@ -170,9 +110,7 @@ steps:
command: pytest -v -s test_logits_processor.py
- label: Utils Test
commands:
- pytest -v -s test_utils.py
- pytest -v -s test_embedded_commit.py
command: pytest -v -s test_utils.py
- label: Worker Test
mirror_hardwares: [amd]
@ -180,33 +118,30 @@ steps:
- label: Speculative decoding tests
#mirror_hardwares: [amd]
command: pytest -v -s spec_decode
- label: LoRA Test %N
#mirror_hardwares: [amd]
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode
# - label: LoRA Test %N
# #mirror_hardwares: [amd]
# command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
# parallelism: 4
# - label: LoRA Long Context (Distributed)
# #mirror_hardwares: [amd]
# num_gpus: 4
# # This test runs llama 13B, so it is required to run on 4 GPUs.
# commands:
# # FIXIT: find out which code initialize cuda before running the test
# # before the fix, we need to use spawn to test it
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -v -s -x lora/test_long_context.py
# Temporarily run this way because we cannot clean up GPU mem usage
# for multi GPU tests.
# TODO(sang): Fix it.
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
- pytest -v -s lora/test_long_context.py::test_self_consistency
- pytest -v -s lora/test_long_context.py::test_quality
- pytest -v -s lora/test_long_context.py::test_max_len
- label: Tensorizer Test
#mirror_hardwares: [amd]
fast_check: true
commands:
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
- label: Metrics Test
mirror_hardwares: [amd]
@ -216,15 +151,6 @@ steps:
#mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
@ -232,37 +158,9 @@ steps:
- pip install aiohttp
- bash run-benchmarks.sh
- label: LM Eval Small Models
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: LM Eval Large Models
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
- label: Distributed Tests (A100)
gpu: a100
num_gpus: 4
commands:
# NOTE: don't test the llama model here; the HF implementation seems buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py

View File

@ -0,0 +1,93 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
- wait
- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}
- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- '-c'
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
{% if not step.no_gpu %}
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}

.github/FUNDING.yml
View File

@ -1,2 +0,0 @@
github: [vllm-project]
open_collective: [vllm]

View File

@ -1,21 +0,0 @@
name: Add label on auto-merge enabled
on:
pull_request_target:
types:
- auto_merge_enabled
jobs:
add-label-on-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -1,23 +0,0 @@
name: Add Ready Label on Ready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -30,6 +30,12 @@ jobs:
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
'csrc/punica/bgmv/bgmv_config.h'
'csrc/punica/bgmv/bgmv_impl.cuh'
'csrc/punica/bgmv/vec_dtypes.cuh'
'csrc/punica/punica_ops.cu'
'csrc/punica/type_convert.h'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@ -32,17 +32,19 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
mypy
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/core --follow-imports skip
mypy vllm/distributed --follow-imports skip
mypy vllm/engine --follow-imports skip
mypy vllm/entrypoints --follow-imports skip
mypy vllm/executor --follow-imports skip
mypy vllm/lora --follow-imports skip
mypy vllm/model_executor --follow-imports skip
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml

View File

@ -48,8 +48,8 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:

View File

@ -1,21 +0,0 @@
name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -1,23 +0,0 @@
name: Remove ready Label on notready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
steps:
- name: Remove ready label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
name: 'ready'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
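For comparison, a manual equivalent of these label workflows, assuming the GitHub CLI (gh) is installed and authenticated; PR_NUMBER is a placeholder for the pull request number:
gh pr edit "$PR_NUMBER" --add-label ready      # what the /ready comment workflow does
gh pr edit "$PR_NUMBER" --remove-label ready   # what the /notready comment workflow does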

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
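To run the same pinned linters locally, a rough sketch (use whichever codespell pin matches your side of this diff, 2.2.6 or 2.3.0):
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
ruff .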

View File

@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build

View File

@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}

.gitignore vendored
View File

@ -1,6 +1,3 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

View File

@ -10,7 +10,6 @@ build:
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:

View File

@ -2,8 +2,7 @@ cmake_minimum_required(VERSION 3.21)
project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -14,7 +13,7 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
@ -32,8 +31,9 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
#
# Try to find python package with an executable that exactly matches
@ -67,37 +67,17 @@ endif()
find_package(Torch REQUIRED)
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` for linking against an extension. Torch's cmake
# configuration does not include this library (presumably since the cmake
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
# reason. So, add it by manually with `find_library` using torch's
# installed library path.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set(VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp")
define_gpu_extension_target(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI)
add_dependencies(default _core_C)
find_library(torch_python_LIBRARY torch_python PATHS
"${TORCH_INSTALL_PREFIX}/lib")
#
# Forward the non-CUDA device extensions to external CMake scripts.
@ -107,7 +87,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
return()
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
endif()
return()
endif()
@ -131,11 +111,18 @@ elseif(HIP_FOUND)
# .hip extension automatically, HIP must be enabled explicitly.
enable_language(HIP)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
# ROCm 5.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
endif()
# ROCm 6.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@ -165,7 +152,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
endif()
#
# Define other extension targets
# Define extension targets
#
#
@ -184,18 +171,16 @@ set(VLLM_EXT_SRC
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
"csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE
# CUTLASS 3.5.0
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
)
FetchContent_MakeAvailable(cutlass)
@ -204,15 +189,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
@ -220,7 +202,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
@ -235,8 +217,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
USE_SABI 3
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
WITH_SOABI)
#
@ -244,7 +225,7 @@ define_gpu_extension_target(
#
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_ops.cpp"
"csrc/moe/topk_softmax_kernels.cu")
define_gpu_extension_target(
@ -254,16 +235,93 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
#
# _punica_C extension
#
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/punica_ops.cu"
"csrc/punica/punica_pybind.cpp")
#
# Copy GPU compilation flags+update for punica
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_HALF2_OPERATORS__")
#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
set(VLLM_PUNICA_GPU_ARCHES)
foreach(ARCH ${VLLM_GPU_ARCHES})
string_to_ver(CODE_VER ${ARCH})
if (CODE_VER GREATER_EQUAL 8.0)
list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
define_gpu_extension_target(
_punica_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_PUNICA_EXT_SRC}
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
WITH_SOABI)
else()
message(WARNING "Unable to create _punica_C target because none of the "
"requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# there are supported target arches.
if (VLLM_PUNICA_GPU_ARCHES AND
(ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
message(STATUS "Enabling punica extension.")
add_dependencies(default _punica_C)
endif()
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
endif()
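A sketch of driving this CMakeLists.txt directly, following the `default` target comment above; the -DVLLM_INSTALL_PUNICA_KERNELS=ON flag is only needed when the optional punica LoRA kernels should be built:
mkdir -p build && cd build
cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE="$(which python3)" \
      -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm \
      -DVLLM_INSTALL_PUNICA_KERNELS=ON ..
cmake --build . --target default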

View File

@ -5,51 +5,31 @@
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y git curl sudo
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
&& apt-get install -y python3-pip git
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
RUN ldconfig /usr/local/cuda-12.4/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
pip install -r requirements-cuda.txt
COPY requirements-mamba.txt requirements-mamba.txt
RUN python3 -m pip install packaging
RUN python3 -m pip install -r requirements-mamba.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -59,16 +39,14 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG PYTHON_VERSION=3.10
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
pip install -r requirements-build.txt
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
@ -79,7 +57,6 @@ COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
@ -90,37 +67,13 @@ ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ "$CUDA_VERSION" = "11.8.0" ]; then \
export SCCACHE_BUCKET=vllm-build-sccache-2; \
else \
export SCCACHE_BUCKET=vllm-build-sccache; \
fi \
&& export SCCACHE_REGION=us-west-2 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
python3 setup.py bdist_wheel --dist-dir=dist
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
@ -128,73 +81,24 @@ RUN python3 check-wheel-size.py dist
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
FROM base as dev
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### MAMBA Build IMAGE ####################
FROM dev as mamba-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
WORKDIR /usr/src/mamba
COPY requirements-mamba.txt requirements-mamba.txt
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel -r requirements-mamba.txt \
--no-build-isolation --no-deps --no-cache-dir
#################### MAMBA Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git vim curl libibverbs-dev
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
&& apt-get install -y python3-pip git vim
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
RUN ldconfig /usr/local/cuda-12.4/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
pip install dist/*.whl --verbose
#################### vLLM installation IMAGE ####################
@ -207,7 +111,7 @@ ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
pip install -r requirements-dev.txt
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
@ -224,7 +128,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
pip install accelerate hf_transfer modelscope
ENV VLLM_USAGE_SOURCE production-docker-image
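As a usage note, a hedged sketch of building the OpenAI-serving image from this Dockerfile; BuildKit is required for the --mount=type=cache steps, and the tag name is illustrative:
DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai \
    --build-arg max_jobs=8 \
    --build-arg nvcc_threads=8 \
    -t vllm/vllm-openai:dev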

View File

@ -1,26 +1,13 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
FROM ubuntu:22.04
RUN apt-get update -y \
&& apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
FROM cpu-test-1 AS build
&& pip install wheel packaging ninja setuptools>=49.4.0 numpy
COPY ./ /workspace/vllm
@ -28,14 +15,8 @@ WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
CMD ["/bin/bash"]

View File

@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..

View File

@ -1,29 +0,0 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
FROM ubuntu:22.04 AS dev
RUN apt-get update -y && \
apt-get install -y python3-pip git
WORKDIR /workspace
# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/
COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
CMD ["/bin/bash"]

View File

@ -1,22 +0,0 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
# These packages will be in rocketce eventually
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /vllm-workspace
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,33 +1,35 @@
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
FROM $BASE_IMAGE
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
RUN echo "Base image is $BASE_IMAGE"
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
# Whether to build triton on rocm
ARG FA_BRANCH="ae7928c"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"
# whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c"
### Base image build stage
FROM $BASE_IMAGE AS base
# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
@ -38,144 +40,75 @@ RUN apt-get update && apt-get install -y \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
ccache \
&& rm -rf /var/lib/apt/lists/*
# When launching the container, mount the code directory to /vllm-workspace
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/vllm-workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240726 \
torchvision==0.20.0.dev20240726 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache
### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
&& python3 -m pip wheel . --wheel-dir=/install
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, we download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
mkdir libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
&& python3 setup.py install \
&& cd ..; \
fi
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install . \
&& cd ../..; \
fi
### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
WORKDIR /vllm-workspace
COPY . .
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
#RUN python3 -m pip install pynvml # to be removed eventually
RUN python3 -m pip install --upgrade pip numba
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \
&& python3 setup.py develop
ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
mkdir -p libs \
&& cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..
CMD ["/bin/bash"]

View File

@ -1,23 +0,0 @@
ARG NIGHTLY_DATE="20240726"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install NumPy 1 instead of NumPy 2.
RUN pip install "numpy<2"
# Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Fix FastAPI dependence
RUN pip install "starlette<0.38.0"
# Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python setup.py develop
CMD ["/bin/bash"]

View File

@ -1,22 +0,0 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-xpu.txt
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
CMD ["/bin/bash"]

View File

@ -1,5 +1,4 @@
include LICENSE
include requirements-adag.txt
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt

View File

@ -16,14 +16,26 @@ Easy, fast, and cheap LLM serving for everyone
---
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
We are thrilled to announce our fourth vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://lu.ma/agivllm) and join us!
---
*Latest News* 🔥
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
@ -39,16 +51,14 @@ vLLM is fast with:
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism and pipeline parallelism support for distributed inference
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
- Support NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
@ -92,17 +102,14 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks
- DeepInfra
- Dropbox
- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
- Roblox
- RunPod
- Sequoia Capital
- Trainy
- UC Berkeley
- UC San Diego
- ZhenFund
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

View File

@ -4,13 +4,10 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import List, Optional
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@ -71,13 +68,9 @@ async def async_request_tgi(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
#NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
@ -225,8 +218,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"completions"
), "OpenAI Completions API URL must end with 'completions'."
"v1/completions"
), "OpenAI Completions API URL must end with 'v1/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@ -265,9 +258,6 @@ async def async_request_openai_completions(
else:
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
@ -276,8 +266,12 @@ async def async_request_openai_completions(
output.ttft = ttft
# Decoding phase
output.itl.append(timestamp -
most_recent_timestamp)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# do not want to include as inter-token-latency
elif data.get("usage", None) is None:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
@ -304,8 +298,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"chat/completions"
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
"v1/chat/completions"
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@ -390,30 +384,6 @@ def remove_prefix(text: str, prefix: str) -> str:
return text
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return pretrained_model_name_or_path
def get_tokenizer(
pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(
pretrained_model_name_or_path)
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
trust_remote_code=trust_remote_code)
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
@ -422,5 +392,4 @@ ASYNC_REQUEST_FUNCS = {
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
}

View File

@ -10,10 +10,8 @@ import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace):
@ -21,33 +19,24 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
speculative_draft_tensor_parallel_size=\
args.speculative_draft_tensor_parallel_size,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
)
llm = LLM(model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization)
sampling_params = SamplingParams(
n=args.n,
@ -61,7 +50,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_inputs: List[PromptInputs] = [{
dummy_inputs: List[PromptStrictInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
@ -106,7 +95,7 @@ def main(args: argparse.Namespace):
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentages = [10, 25, 50, 75, 90]
percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
@ -124,16 +113,12 @@ def main(args: argparse.Namespace):
if __name__ == '__main__':
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
@ -159,12 +144,6 @@ if __name__ == '__main__':
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
@ -208,10 +187,9 @@ if __name__ == '__main__':
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
@ -221,9 +199,6 @@ if __name__ == '__main__':
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
@ -246,40 +221,5 @@ if __name__ == '__main__':
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args()
main(args)
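For reference, a hedged example of invoking this benchmark with a few of the arguments defined above (the script path benchmarks/benchmark_latency.py and the flag values are illustrative assumptions):
python benchmarks/benchmark_latency.py \
    --model facebook/opt-125m \
    --max-model-len 2048 \
    --device auto \
    --load-format auto \
    --enable-prefix-caching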

View File

@ -1,7 +1,7 @@
import argparse
import time
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@ -44,7 +44,7 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',

View File

@ -2,8 +2,8 @@
On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
python -m vllm.entrypoints.openai.api_server \
--model <your_model> --swap-space 16 \
--disable-log-requests
(TGI backend)
@ -17,7 +17,7 @@ On the client side, run:
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
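Putting the docstring fragments together, a hedged end-to-end sketch; the client script path benchmarks/benchmark_serving.py is an assumption, and the angle-bracket placeholders are kept from the docstring above:
vllm serve <your_model> --swap-space 16 --disable-log-requests
python benchmarks/benchmark_serving.py \
    --dataset-path <path to dataset> \
    --request-rate 10 \
    --num-prompts 1000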
@ -31,7 +31,7 @@ import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@ -39,15 +39,7 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from vllm.transformers_utils.tokenizer import get_tokenizer
@dataclass
@ -60,16 +52,10 @@ class BenchmarkMetrics:
output_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
p99_tpot_ms: float
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
p99_itl_ms: float
def sample_sharegpt_requests(
@ -80,6 +66,7 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@ -187,31 +174,6 @@ def sample_sonnet_requests(
return sampled_requests
def sample_random_requests(
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
input_lens = np.random.randint(
int(input_len * range_ratio),
input_len + 1,
size=num_prompts,
)
output_lens = np.random.randint(
int(output_len * range_ratio),
output_len + 1,
size=num_prompts,
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i])))
return input_requests
async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
@ -223,7 +185,6 @@ async def get_request(
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval.
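# Illustrative arithmetic: at request_rate = 10 QPS the gaps are drawn from an
# exponential distribution with mean 1 / 10 = 0.1 s, i.e. requests arrive as a
# Poisson process rather than at fixed intervals.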
@ -236,27 +197,19 @@ def calculate_metrics(
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
actual_output_lens = []
total_input = 0
completed = 0
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
tpots = []
ttfts = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
output_len = len(tokenizer(outputs[i].generated_text).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
if output_len > 1:
tpots.append(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
completed += 1
else:
@ -277,16 +230,10 @@ def calculate_metrics(
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
)
return metrics, actual_output_lens
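The TPOT statistics assembled here come from the per-request computation (latency - ttft) / (output_len - 1) shown earlier in this hunk; a small worked example with assumed numbers:
# Hypothetical values, not taken from the diff.
latency_s, ttft_s, output_len = 2.0, 0.5, 31
tpot_s = (latency_s - ttft_s) / (output_len - 1)  # time per output token, excluding the first token
print(f"TPOT = {tpot_s * 1000:.1f} ms")  # -> TPOT = 50.0 ms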
@ -304,7 +251,7 @@ async def benchmark(
disable_tqdm: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
request_func = ASYNC_REQUEST_FUNCS.get(backend)
else:
raise ValueError(f"Unknown backend: {backend}")
@ -331,7 +278,7 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
tasks = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
@ -349,7 +296,7 @@ async def benchmark(
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if pbar is not None:
if not disable_tqdm:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
@ -386,10 +333,6 @@ async def benchmark(
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
print("=" * 50)
result = {
@ -402,16 +345,10 @@ async def benchmark(
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
@ -490,15 +427,6 @@ def main(args: argparse.Namespace):
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]
elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
)
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
@ -517,7 +445,7 @@ def main(args: argparse.Namespace):
# Save config and results to json
if args.save_result:
result_json: Dict[str, Any] = {}
result_json = {}
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@ -550,8 +478,6 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile:
@ -559,7 +485,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
@ -592,7 +518,7 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet", "random"],
choices=["sharegpt", "sonnet"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@ -609,7 +535,7 @@ if __name__ == "__main__":
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
"Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--best-of",
@ -652,27 +578,6 @@ if __name__ == "__main__":
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument(
"--request-rate",
type=float,
@ -713,15 +618,6 @@ if __name__ == "__main__":
help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.",
)
parser.add_argument(
"--result-filename",
type=str,
default=None,
help="Specify the filename to save benchmark json results."
"If not specified, results will be saved in "
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
" format.",
)
args = parser.parse_args()
main(args)

View File

@ -10,9 +10,7 @@ from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def sample_requests(
@ -80,10 +78,8 @@ def run_vllm(
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@ -104,13 +100,11 @@ def run_vllm(
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
)
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
prompts = []
sampling_params = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
@ -231,8 +225,8 @@ def main(args: argparse.Namespace):
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.download_dir, args.load_format)
args.max_num_batched_tokens, args.gpu_memory_utilization,
args.download_dir)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -262,7 +256,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
@ -349,10 +343,9 @@ if __name__ == "__main__":
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO, TPU, '
'XPU and CPU.')
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
@ -375,36 +368,6 @@ if __name__ == "__main__":
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -1,360 +0,0 @@
import argparse
import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
# helpers
def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
if dtype == torch.int8:
return to_int8(a), to_int8(b)
if dtype == torch.float8_e4m3fn:
return to_fp8(a), to_fp8(b)
raise ValueError("unsupported dtype")
# impl
def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch.mm(a, b)
def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype)
def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
scale_a: torch.Tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype,
use_fast_accum=True)
def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
# bench
def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
sub_label: str, fn: Callable, description: str) -> TMeasurement:
min_run_time = 1
globals = {
"a": a,
"b": b,
"scale_a": scale_a,
"scale_b": scale_b,
"out_dtype": out_dtype,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(a, b, scale_a, scale_b, out_dtype)",
globals=globals,
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.int8
a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
timers = []
# pytorch impl - bfloat16
timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl - float16
timers.append(
bench_fn(a.to(dtype=torch.float16, device="cuda"),
b.to(dtype=torch.float16, device="cuda"), scale_a, scale_b,
torch.float16, label, sub_label, pytorch_mm_impl,
"pytorch_fp16_fp16_fp16_matmul-no-scales"))
# cutlass impl
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
return timers
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
timers = []
# pytorch impl w. bf16
timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
pytorch_fp8_impl_fast_accum,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
pytorch_fp8_impl_fast_accum,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
# cutlass impl: bf16 output
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
# cutlass impl: fp16 output
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
return timers
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})")
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
print_timers(data)
# pickle all the results
timestamp = int(time.time()) if timestamp is None else timestamp
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
n = len(dim_sizes)
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
MKNs = list(zip(Ms, Ks, Ns))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KNs.append(KN)
return KNs
model_bench_data = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
Ms = args.batch_sizes
KNs = model_shapes(model, tp_size)
MKNs = []
for m in Ms:
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, MKNs)
model_bench_data.append(data)
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
all_data = []
for d in model_bench_data:
all_data.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
if __name__ == '__main__':
def to_torch_dtype(dt):
if dt == "int8":
return torch.int8
if dt == "fp8":
return torch.float8_e4m3fn
raise ValueError("unsupported dtype")
parser = FlexibleArgumentParser(
description="""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file that is a list of raw torch.utils.benchmark Measurements for the pytorch and cutlass implementations of the various GEMMs.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--dtype",
type=to_torch_dtype,
required=True,
help="Available options are ['int8', 'fp8']")
subparsers = parser.add_subparsers(dest="cmd")
square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
square_parser.add_argument("--dim-increment", type=int, required=True)
square_parser.set_defaults(func=run_square_bench)
range_parser = subparsers.add_parser("range_bench")
range_parser.add_argument("--dim-start", type=int, required=True)
range_parser.add_argument("--dim-end", type=int, required=True)
range_parser.add_argument("--dim-increment", type=int, required=True)
range_parser.add_argument("--m-constant", type=int, default=None)
range_parser.add_argument("--n-constant", type=int, default=None)
range_parser.add_argument("--k-constant", type=int, default=None)
range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument("--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys())
model_parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
model_parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)
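The help text above notes that each run also pickles the raw torch.utils.benchmark Measurements; a minimal sketch of reading such a file back (the file name is assumed):
import pickle

import torch.utils.benchmark as TBenchmark

# Path is hypothetical; make_output() / run_model_bench() name files after the
# run type, dtype and timestamp.
with open("model_bench-torch.float8_e4m3fn-1717286400.pkl", "rb") as f:
    measurements = pickle.load(f)

# Reuse the same Compare/print path the script itself uses.
TBenchmark.Compare(measurements).print()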

View File

@ -1,43 +0,0 @@
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
}
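A minimal sketch of the ([K, N], TP_SPLIT_DIM) convention documented at the top of this file, using a hypothetical helper; the check values are taken from the comment's own examples:
from typing import List, Tuple

def shard_shape(kn: List[int], tp_split_dim: int, tp_size: int) -> Tuple[int, int]:
    # Divide only the dimension marked by TP_SPLIT_DIM across tensor-parallel ranks.
    k, n = kn
    if tp_split_dim == 0:
        k //= tp_size
    else:
        n //= tp_size
    return k, n

assert shard_shape([14336, 4096], 0, 2) == (7168, 4096)  # TP2 splits K
assert shard_shape([4096, 6144], 1, 4) == (4096, 1536)   # TP4 splits N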

View File

@ -1,3 +1,4 @@
import argparse
import os
import sys
from typing import Optional
@ -9,7 +10,6 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm)
from vllm.utils import FlexibleArgumentParser
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@ -86,9 +86,9 @@ def dequant_no_scale(
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
n = int(parts.sum().item())
n = parts.sum().item()
device = torch.device('cuda:0')
@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
def main():
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument("--nbooks",
@ -204,7 +204,7 @@ def main():
sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
methods):
# I didn't see visible improvements from increasing these, but feel free :)
@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
print('')
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
nbooks: int, bits: int, method) -> float:
n = int(parts.sum().item())
n = parts.sum().item()
device = torch.device('cuda:0')

View File

@ -1,24 +1,20 @@
from typing import List
import argparse
import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, gptq_quantize_weights, sort_weights)
from vllm.scalar_type import ScalarType
from vllm.utils import FlexibleArgumentParser
gptq_pack, quantize_weights, sort_weights)
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -27,15 +23,13 @@ ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True]
def bench_run(results: List[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, quant_type: ScalarType,
group_size: int, size_m: int, size_k: int, size_n: int):
def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
size_m, size_k, size_n):
label = "Quant Matmul"
sub_label = ("{}, act={} k_full={}, q={}, g={}, "
"MKN=({}x{}x{})".format(model, act_order, is_k_full,
str(quant_type), group_size, size_m,
size_k, size_n))
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
"MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
group_size, size_m, size_k, size_n))
print(f"Testing: {sub_label}")
@ -52,18 +46,16 @@ def bench_run(results: List[benchmark.Measurement], model: str,
marlin_g_idx,
marlin_sort_indices,
marlin_rand_perm,
) = marlin_quantize(b, quant_type, group_size, act_order)
) = marlin_quantize(b, num_bits, group_size, act_order)
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
marlin_24_s) = marlin_24_quantize(b, quant_type, group_size)
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
# GPTQ quant
(w_ref, q_w, s, g_idx,
rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
@ -77,11 +69,10 @@ def bench_run(results: List[benchmark.Measurement], model: str,
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_MAX_PARALLEL)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
globals = {
# Gen params
"quant_type": quant_type,
"num_bits": num_bits,
"group_size": group_size,
"size_m": size_m,
"size_n": size_n,
@ -92,7 +83,6 @@ def bench_run(results: List[benchmark.Measurement], model: str,
"marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w,
"marlin_s": marlin_s,
"marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
@ -131,29 +121,19 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp16",
description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time))
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time))
if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -163,7 +143,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -176,7 +156,7 @@ def main(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: List[benchmark.Measurement] = []
results = []
for model in args.models:
for layer in WEIGHT_SHAPES[model]:
@ -199,13 +179,12 @@ def main(args):
) > 0 and is_k_full not in args.limit_k_full:
continue
for quant_type in query_marlin_supported_quant_types(
False):
if len(args.limit_num_bits) > 0 and \
quant_type.size_bits not in args.limit_num_bits:
for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
if len(args.limit_num_bits
) > 0 and num_bits not in args.limit_num_bits:
continue
for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
if len(
args.limit_group_size
) > 0 and group_size not in args.limit_group_size:
@ -219,8 +198,8 @@ def main(args):
for size_m in args.batch_sizes:
bench_run(results, model, act_order, is_k_full,
quant_type, group_size, size_m,
size_k, size_n)
num_bits, group_size, size_m, size_k,
size_n)
compare = benchmark.Compare(results)
compare.print()
@ -230,7 +209,7 @@ def main(args):
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
#
if __name__ == "__main__":
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description="Benchmark Marlin across specified models/shapes/batches")
parser.add_argument(
"--models",

View File

@ -0,0 +1,239 @@
import argparse
import json
import os
import sys
import torch
import torch.nn.functional as F
import triton
from tqdm import tqdm
from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
def main(model, tp_size, gpu, dtype: str):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
run_grid(bs,
model=model,
method=method,
gpu=gpu,
tp_size=tp_size,
dtype=dtype)
def run_grid(bs, model, method, gpu, tp_size, dtype: str):
if model == '8x7B':
d_model = 4096
model_intermediate_size = 14336
num_layers = 32
elif model == '8x22B':
d_model = 6144
model_intermediate_size = 16384
num_layers = 56
else:
raise ValueError(f'Unsupported Mixtral model {model}')
num_total_experts = 8
top_k = 2
# tp_size = 2
num_calls = 100
num_warmup_trials = 1
num_trials = 1
configs = []
for block_size_n in [32, 64, 128, 256]:
for block_size_m in [16, 32, 64, 128, 256]:
for block_size_k in [64, 128, 256]:
for group_size_m in [1, 16, 32, 64]:
for num_warps in [4, 8]:
for num_stages in [2, 3, 4, 5]:
configs.append({
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": num_stages,
})
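# For reference, the nested loops above enumerate 4 x 5 x 3 x 4 x 2 x 4 = 1,920
# candidate configurations per batch size, each of which is timed below.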
best_config = None
best_time_us = 1e20
print(f'{tp_size=} {bs=}')
for config in tqdm(configs):
# warmup
try:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
)
except triton.runtime.autotuner.OutOfResources:
continue
# trial
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
)
kernel_dur_us = 1000 * kernel_dur_ms
model_dur_ms = kernel_dur_ms * num_layers
if kernel_dur_us < best_time_us:
best_config = config
best_time_us = kernel_dur_us
tqdm.write(
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
f'{d_model=} {model_intermediate_size=} {num_layers=}')
print("best_time_us", best_time_us)
print("best_config", best_config)
# holds Dict[str, Dict[str, int]]
filename = get_config_file_name(num_total_experts,
model_intermediate_size // tp_size,
"float8" if dtype == "float8" else None)
print(f"writing config to file {filename}")
existing_content = {}
if os.path.exists(filename):
with open(filename, "r") as f:
existing_content = json.load(f)
existing_content[str(bs)] = best_config
with open(filename, "w") as f:
json.dump(existing_content, f, indent=4)
f.write("\n")
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
top_k: int, tp_size: int, model_intermediate_size: int, method,
config, dtype: str) -> float:
shard_intermediate_size = model_intermediate_size // tp_size
hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
dtype=torch.float16,
)
w1 = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w2 = torch.rand(
(num_total_experts, d_model, shard_intermediate_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if dtype == "float8":
w1 = w1.to(torch.float8_e4m3fn)
w2 = w2.to(torch.float8_e4m3fn)
w1_scale = torch.ones(num_total_experts,
device=hidden_states.device,
dtype=torch.float32)
w2_scale = torch.ones(num_total_experts,
device=hidden_states.device,
dtype=torch.float32)
a1_scale = torch.ones(1,
device=hidden_states.device,
dtype=torch.float32)
a2_scale = torch.ones(1,
device=hidden_states.device,
dtype=torch.float32)
gating_output = F.softmax(torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
dtype=torch.float32,
),
dim=-1)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
w1=w1,
w2=w2,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
gating_output=gating_output[i],
topk=2,
renormalize=True,
inplace=True,
override_config=config,
use_fp8=dtype == "float8",
)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog='benchmark_mixtral_moe',
description='Benchmark and tune the fused_moe kernel',
)
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['float8', 'float16'],
help='Data type used for fused_moe kernel computations',
)
parser.add_argument('--model',
type=str,
default='8x7B',
choices=['8x7B', '8x22B'],
help='The Mixtral model to benchmark')
parser.add_argument('--tp-size',
type=int,
default=2,
help='Tensor parallel size')
parser.add_argument('--gpu',
type=int,
default=0,
help="GPU ID for benchmarking")
args = parser.parse_args()
sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype))

View File

@ -1,333 +0,0 @@
import argparse
import time
from datetime import datetime
from typing import Any, Dict, List, Tuple, TypedDict
import ray
import torch
import triton
from ray.experimental.tqdm_ray import tqdm
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.utils import FlexibleArgumentParser
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
BLOCK_SIZE_K: int
GROUP_SIZE_M: int
num_warps: int
num_stages: int
def benchmark_config(
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8: bool,
num_iters: int = 100,
) -> float:
init_dtype = torch.float16 if use_fp8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
w1 = torch.randn(num_experts,
shard_intermediate_size,
hidden_size,
dtype=init_dtype)
w2 = torch.randn(num_experts,
hidden_size,
shard_intermediate_size // 2,
dtype=init_dtype)
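# shard_intermediate_size is already doubled for the fused gate/up projection
# (see how main() computes it below); silu_and_mul halves that width again,
# which is why w2's last dimension is shard_intermediate_size // 2.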
gating_output = torch.randn(num_iters,
num_tokens,
num_experts,
dtype=torch.float32)
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if use_fp8:
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
a1_scale = torch.randn(1, dtype=torch.float32)
a2_scale = torch.randn(1, dtype=torch.float32)
w1 = w1.to(torch.float8_e4m3fn)
w2 = w2.to(torch.float8_e4m3fn)
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
override_config=config,
use_fp8=use_fp8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
)
# JIT compilation & warmup
run()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: List[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000 # us
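# Each graph.replay() above executes 10 fused_moe calls, so dividing the summed
# per-replay times (ms) by num_iters * 10 and multiplying by 1000 yields the
# average latency of a single invocation in microseconds.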
graph.reset()
return avg
def get_configs_compute_bound() -> List[Dict[str, int]]:
# Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to
# prune the search space.
configs: List[BenchmarkConfig] = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128, 256]:
for block_n in [32, 64, 128, 256]:
for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]:
configs.append({
"BLOCK_SIZE_M": block_m,
"BLOCK_SIZE_N": block_n,
"BLOCK_SIZE_K": block_k,
"GROUP_SIZE_M": group_size,
"num_warps": num_warps,
"num_stages": num_stages,
})
return configs
@ray.remote(num_gpus=1)
class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
torch.cuda.manual_seed_all(seed)
self.seed = seed
def benchmark(
self,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8: bool,
) -> Tuple[Dict[str, int], float]:
torch.cuda.manual_seed_all(self.seed)
dtype_str = "float8" if use_fp8 else None
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
dtype_str)
if op_config is None:
config = get_default_config(num_tokens, num_experts,
shard_intermediate_size, hidden_size,
topk, dtype_str)
else:
config = op_config[min(op_config.keys(),
key=lambda x: abs(x - num_tokens))]
kernel_time = benchmark_config(config, num_tokens, num_experts,
shard_intermediate_size, hidden_size,
topk, dtype, use_fp8)
return config, kernel_time
def tune(
self,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8: bool,
search_space: List[BenchmarkConfig],
) -> BenchmarkConfig:
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8,
num_iters=10)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if kernel_time < best_time:
best_time = kernel_time
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
return best_config
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
return {
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
"BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
"GROUP_SIZE_M": config["GROUP_SIZE_M"],
"num_warps": config["num_warps"],
"num_stages": config["num_stages"],
}
def save_configs(
configs: Dict[int, BenchmarkConfig],
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8: bool,
) -> None:
dtype_str = "float8" if use_fp8 else None
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
dtype_str)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
json.dump(configs, f, indent=4)
f.write("\n")
def main(args: argparse.Namespace):
print(args)
config = AutoConfig.from_pretrained(args.model)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = config.torch_dtype
use_fp8 = args.dtype == "fp8"
if args.batch_size is None:
batch_sizes = [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]
else:
batch_sizes = [args.batch_size]
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
outputs = []
worker_idx = 0
for input_args in inputs:
worker = workers[worker_idx]
worker_method = getattr(worker, method)
output = worker_method.remote(*input_args)
outputs.append(output)
worker_idx = (worker_idx + 1) % num_gpus
return ray.get(outputs)
if args.tune:
search_space = get_configs_compute_bound()
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8, search_space)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
}
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8)
end = time.time()
print(f"Tuning took {end - start:.2f} seconds")
else:
outputs = _distribute("benchmark",
[(batch_size, E, shard_intermediate_size,
hidden_size, topk, dtype, use_fp8)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
print(f"Batch size: {batch_size}, config: {config}")
print(f"Kernel time: {kernel_time:.2f} us")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--model",
type=str,
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
parser.add_argument("--tp-size", "-tp", type=int, default=2)
parser.add_argument("--dtype",
type=str,
choices=["auto", "fp8"],
default="auto")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
args = parser.parse_args()
main(args)

View File

@ -1,12 +1,12 @@
import argparse
import random
import time
from typing import List, Optional
from typing import Optional
import torch
from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random)
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@ -54,17 +54,14 @@ def main(
# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
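# Ceiling division: e.g. max_seq_len = 4096 with block_size = 16 gives 256 blocks
# per sequence (4097 tokens would need 257).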
block_tables_lst: List[List[int]] = []
block_tables = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq)
]
block_tables_lst.append(block_table)
block_tables = torch.tensor(block_tables_lst,
dtype=torch.int,
device=device)
block_tables.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
# Create the KV cache.
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
@ -100,7 +97,7 @@ def main(
start_time = time.perf_counter()
# Using default kv_scale
k_scale = v_scale = 1.0
kv_scale = 1.0
for _ in range(num_iters):
if version == "v1":
@ -117,8 +114,7 @@ def main(
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
kv_scale,
)
elif version == "v2":
ops.paged_attention_v2(
@ -137,8 +133,7 @@ def main(
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
kv_scale,
)
else:
raise ValueError(f"Invalid version: {version}")
@ -163,19 +158,19 @@ def main(
if __name__ == '__main__':
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description="Benchmark the paged attention kernel.")
parser.add_argument("--version",
type=str,
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--seq-len", type=int, default=4096)
parser.add_argument("--seq_len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")

View File

@ -1,12 +1,11 @@
import argparse
from itertools import accumulate
from typing import List, Optional
from typing import Optional
import nvtx
import torch
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
from vllm.utils import FlexibleArgumentParser
from vllm.model_executor.layers.rotary_embedding import get_rope
def benchmark_rope_kernels_multi_lora(
@ -38,7 +37,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes: List[RotaryEmbedding] = []
non_batched_ropes = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
@ -86,7 +85,7 @@ def benchmark_rope_kernels_multi_lora(
if __name__ == '__main__':
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description="Benchmark the rotary embedding kernels.")
parser.add_argument("--is-neox-style", type=bool, default=True)
parser.add_argument("--batch-size", type=int, default=16)
@ -94,7 +93,7 @@ if __name__ == '__main__':
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",

View File

@ -1,8 +1,8 @@
import argparse
import cProfile
import pstats
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
@ -47,7 +47,7 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description='Benchmark the performance of hashing function in '
'automatic prefix caching.')
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')

View File

@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
# Check the compile flags
#
list(APPEND CXX_COMPILE_FLAGS
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
@ -33,23 +33,9 @@ function (find_isa CPUINFO TARGET OUT)
endif()
endfunction()
function (is_avx512_disabled OUT)
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
set(${OUT} ON PARENT_SCOPE)
else()
set(${OUT} OFF PARENT_SCOPE)
endif()
endfunction()
is_avx512_disabled(AVX512_DISABLED)
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
if (AVX512_FOUND AND NOT AVX512_DISABLED)
if (AVX512_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mavx512f"
"-mavx512vl"
@ -58,8 +44,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
else()
message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
@ -67,24 +53,12 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=native"
"-mtune=native")
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS "numa")
#
# Define extension targets
@ -97,21 +71,20 @@ set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
"csrc/cpu/attention.cpp"
"csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
"csrc/cpu/pybind.cpp")
define_gpu_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI
WITH_SOABI
)
add_custom_target(default)
message(STATUS "Enabling C extension.")
add_dependencies(default _C)

View File

@ -5,7 +5,7 @@
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
set(Python_EXECUTABLE ${EXECUTABLE})
find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
find_package(Python COMPONENTS Interpreter Development.Module)
if (NOT Python_FOUND)
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
endif()
@ -147,23 +147,16 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
if (${GPU_LANG} STREQUAL "HIP")
#
# `GPU_ARCHES` controls the `--offload-arch` flags.
# `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
# via the `PYTORCH_ROCM_ARCH` env variable.
#
# If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
# if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
# "rocm_agent_enumerator" in "enable_language(HIP)"
# (in file Modules/CMakeDetermineHIPCompiler.cmake)
#
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
else()
set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
endif()
#
# Find the intersection of the supported + detected architectures to
# set the module architecture flags.
#
set(${GPU_ARCHES})
foreach (_ARCH ${HIP_ARCHITECTURES})
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
list(APPEND ${GPU_ARCHES} ${_ARCH})
endif()
@ -171,7 +164,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
if(NOT ${GPU_ARCHES})
message(FATAL_ERROR
"None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
"None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
endif()
@ -181,7 +174,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
#
# The torch cmake setup hardcodes the detected architecture flags in
# `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
# can't modified on a per-target basis.
# can't modified on a per-target basis, e.g. for the `punica` extension.
# So, all the `-gencode` flags need to be extracted and removed from
# `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
# Since it's not possible to use `target_compiler_options` for adding target
@ -301,7 +294,6 @@ endmacro()
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries> - Extra link libraries.
# WITH_SOABI - Generate library with python SOABI suffix name.
# USE_SABI <version> - Use python stable api <version>
#
# Note: optimization level/debug info is set via cmake build type.
#
@ -309,7 +301,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
cmake_parse_arguments(PARSE_ARGV 1
GPU
"WITH_SOABI"
"DESTINATION;LANGUAGE;USE_SABI"
"DESTINATION;LANGUAGE"
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
# Add hipify preprocessing step when building with HIP/ROCm.
@ -323,11 +315,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
set(GPU_WITH_SOABI)
endif()
if (GPU_USE_SABI)
Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
else()
Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
endif()
Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
if (GPU_LANGUAGE STREQUAL "HIP")
# Make this target dependent on the hipify preprocessor step.

Some files were not shown because too many files have changed in this diff.