diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml new file mode 100644 index 0000000000..514c15d609 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml @@ -0,0 +1,14 @@ +model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8" +tasks: + - name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.82 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 +enforce_eager: false # we use false to speed up the eval process +kv_cache_dtype: fp8 # we use fp8 to speed up the eval process +max_model_len: 40960 +apply_chat_template: true +fewshot_as_multiturn: true +gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>" diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt deleted file mode 100644 index 4fb0b84bc4..0000000000 --- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt +++ /dev/null @@ -1 +0,0 @@ -Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt new file mode 100644 index 0000000000..5552391d9e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt @@ -0,0 +1 @@ +Qwen3-235B-A22B-Instruct-2507-FP8.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index f10de82b1d..3627b760ed 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size): max_model_len = eval_config.get("max_model_len", 4096) batch_size = eval_config.get("batch_size", "auto") backend = eval_config.get("backend", "vllm") + enforce_eager = eval_config.get("enforce_eager", "true") + kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," - f"enforce_eager=true," + f"enforce_eager={enforce_eager}," + f"kv_cache_dtype={kv_cache_dtype}," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," f"max_model_len={max_model_len}," @@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size): limit=eval_config["limit"], # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help # text models. however, this is regressing measured strict-match for - # existing text models in CI, so only apply it for mm. - apply_chat_template=backend == "vllm-vlm", + # existing text models in CI, so only apply it for mm, or explicitly set + apply_chat_template=eval_config.get( + "apply_chat_template", backend == "vllm-vlm" + ), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), + # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) 
+ gen_kwargs=eval_config.get("gen_kwargs"), batch_size=batch_size, ) return results diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml deleted file mode 100644 index 4259514940..0000000000 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ /dev/null @@ -1,184 +0,0 @@ -steps: - - label: "Wait for container to be ready" - key: wait-for-container-image - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - - image: badouralix/curl-jq - command: - - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - - label: "Cleanup H100" - agents: - queue: H100 - depends_on: ~ - command: docker system prune -a --volumes --force - - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - # Premerge benchmark - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: 
hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md deleted file mode 100644 index 466def07b6..0000000000 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ /dev/null @@ -1,28 +0,0 @@ -# Nightly benchmark annotation - -## Description - -This file contains the downloading link for benchmarking results. - -- [benchmarking pipeline](artifact://nightly-pipeline.yaml) -- [benchmarking results](artifact://results.zip) -- [benchmarking code](artifact://nightly-benchmarks.zip) - -Please download the visualization scripts in the post - -## Results reproduction - -- Find the docker we use in `benchmarking pipeline` -- Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code: - - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` - -And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md deleted file mode 100644 index 2ef36089b6..0000000000 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ /dev/null @@ -1,39 +0,0 @@ - -# Nightly benchmark - -This benchmark aims to: - -- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload. -- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions. - -Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. 
- -Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) - -## Setup - -- Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. -- Hardware - - 8x Nvidia A100 GPUs -- Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). - -## Known issues - -- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). -- TGI does not support `ignore-eos` flag. diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml deleted file mode 100644 index 199517e8b0..0000000000 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ /dev/null @@ -1,196 +0,0 @@ -common_pod_spec: &common_pod_spec - priorityClassName: perf-benchmark - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - -common_container_settings: &common_container_settings - command: - - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - -steps: - - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." 
- - - - - label: "A100 vllm step 10" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.6.2 - <<: *common_container_settings - - - - - label: "A100 sglang benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: lmsysorg/sglang:v0.3.2-cu121 - <<: *common_container_settings - - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: openmmlab/lmdeploy:v0.6.1-cu12 - <<: *common_container_settings - - - - - - label: "A100 trt llama-8B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama8B" - - - - label: "A100 trt llama-70B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama70B" - - - # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image - # - label: "A100 trt benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - # <<: *common_container_settings - - - # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. - # - label: "A100 tgi benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 - # <<: *common_container_settings - - - wait - - - label: "Collect the results" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.5.0.post1 - command: - - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - - block: ":rocket: check the results!" 
\ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py deleted file mode 100644 index 8532ff7ef7..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse - -from transformers import AutoTokenizer - - -def main(model, cachedir): - # Load the tokenizer and save it to the specified directory - tokenizer = AutoTokenizer.from_pretrained(model) - tokenizer.save_pretrained(cachedir) - print(f"Tokenizer saved to {cachedir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Download and save Hugging Face tokenizer" - ) - parser.add_argument("--model", type=str, required=True, help="Name of the model") - parser.add_argument( - "--cachedir", type=str, required=True, help="Directory to save the tokenizer" - ) - - args = parser.parse_args() - main(args.model, args.cachedir) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py deleted file mode 100644 index 053fd52c35..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import json -from pathlib import Path - -import numpy as np -import pandas as pd -from tabulate import tabulate - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Parse command line arguments for summary-nightly-results script." - ) - parser.add_argument( - "--results-folder", - type=str, - required=True, - help="The folder where the results are stored.", - ) - parser.add_argument( - "--description", type=str, required=True, help="Description of the results." 
- ) - - args = parser.parse_args() - return args - - -def get_perf(df, method, model, metric): - means = [] - - for qps in [2, 4, 8, 16, "inf"]: - target = df["Test name"].str.contains(model) - target = target & df["Engine"].str.contains(method) - target = target & df["Test name"].str.contains("qps_" + str(qps)) - filtered_df = df[target] - - if filtered_df.empty: - means.append(0.0) - else: - means.append(filtered_df[metric].values[0]) - - return np.array(means) - - -def get_perf_w_std(df, method, model, metric): - if metric in ["TTFT", "ITL"]: - mean = get_perf(df, method, model, "Mean " + metric + " (ms)") - mean = mean.tolist() - std = get_perf(df, method, model, "Std " + metric + " (ms)") - if std.mean() == 0: - std = None - success = get_perf(df, method, model, "Successful req.") - if std is not None: - std = std / np.sqrt(success) - std = std.tolist() - - else: - assert metric == "Tput" - mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( - df, method, model, "Output Tput (tok/s)" - ) - mean = mean.tolist() - std = None - - return mean, std - - -def main(args): - results_folder = Path(args.results_folder) - - results = [] - - # collect results - for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file) as f: - results = results + json.loads(f.read()) - - # generate markdown table - df = pd.DataFrame.from_dict(results) - - md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) - - with open(args.description) as f: - description = f.read() - - description = description.format(nightly_results_benchmarking_table=md_table) - - with open("nightly_results.md", "w") as f: - f.write(description) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py deleted file mode 100644 index ddea1d2b1b..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from lmdeploy.serve.openai.api_client import APIClient - -api_client = APIClient("http://localhost:8000") -model_name = api_client.available_models[0] - -print(model_name) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh deleted file mode 100644 index 69b6b146b3..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -set -ex -set -o pipefail - - -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get update && apt-get -y install jq) - (which zip) || (apt-get install -y zip) - - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip plotting the results." 
- exit 0 - fi - - # initial annotation - #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" - - # download results - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - mkdir -p results/ - /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ - ls - ls results/ - - # upload benchmark results - zip -r results.zip results/ - /workspace/buildkite-agent artifact upload "results.zip" - - # upload benchmarking scripts - cd "$VLLM_SOURCE_CODE_LOC/" - zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ - /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - # upload benchmarking pipeline - /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md - - - - # The figures should be generated by a separate process outside the CI/CD pipeline - - # # generate figures - # python3 -m pip install tabulate pandas matplotlib - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ - # --description $description \ - # --results-folder results/ - - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sharegpt - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_2048_128 - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_128_2048 - - # # upload results and figures - # /workspace/buildkite-agent artifact upload "nightly_results*.png" - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh deleted file mode 100644 index a00de940cb..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ /dev/null @@ -1,464 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -x - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" - echo "GPU type is $gpu_type" -} - -check_hf_token() { - # check if HF_TOKEN is available and valid - if [[ -z "$HF_TOKEN" ]]; then - echo "Error: HF_TOKEN is not set." - exit 1 - elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then - echo "Error: HF_TOKEN does not start with 'hf_'." - exit 1 - else - echo "HF_TOKEN is set and valid." 
- fi -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -get_current_llm_serving_engine() { - - if which lmdeploy >/dev/null; then - echo "Container: lmdeploy" - export CURRENT_LLM_SERVING_ENGINE=lmdeploy - return - fi - - if [ -e /tgi-entrypoint.sh ]; then - echo "Container: tgi" - export CURRENT_LLM_SERVING_ENGINE=tgi - return - fi - - if which trtllm-build >/dev/null; then - echo "Container: tensorrt-llm" - export CURRENT_LLM_SERVING_ENGINE=trt - return - fi - - if [ -e /sgl-workspace ]; then - echo "Container: sglang" - export CURRENT_LLM_SERVING_ENGINE=sglang - return - fi - - if [ -e /vllm-workspace ]; then - echo "Container: vllm" - # move to a completely irrelevant directory, to avoid import vllm from current folder - export CURRENT_LLM_SERVING_ENGINE=vllm - - return - fi -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -kill_gpu_processes() { - pkill -f '[p]ython' - pkill -f '[p]ython3' - pkill -f '[t]ritonserver' - pkill -f '[p]t_main_thread' - pkill -f '[t]ext-generation' - pkill -f '[l]mdeploy' - # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 - pkill -f '[V]LLM' - - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do - sleep 1 - done -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -ensure_installed() { - # Ensure that the given command is installed by apt-get - local cmd=$1 - if ! which "$cmd" >/dev/null; then - apt-get update && apt-get install -y "$cmd" - fi -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." 
- continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # prepare tokenizer - # this is required for lmdeploy. - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - - - # change model name for lmdeploy (it will not follow standard hf name) - if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then - model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ $backend = "trt" ]]; then - backend="tensorrt-llm" - fi - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - - if [[ "$dataset_name" = "sharegpt" ]]; then - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - elif [[ "$dataset_name" = "sonnet" ]]; then - - sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') - sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') - sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts 
\ - --sonnet-input-len $sonnet_input_len \ - --sonnet-output-len $sonnet_output_len \ - --sonnet-prefix-len $sonnet_prefix_len \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - else - - echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." - exit 1 - - fi - - - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - server_command="None" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - done - - kill_gpu_processes -} - -run_genai_perf_tests() { - # run genai-perf tests - - # $1: a json file specifying genai-perf test cases - local genai_perf_test_file - genai_perf_test_file=$1 - - # Iterate over genai-perf tests - jq -c '.[]' "$genai_perf_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps=$num_prompts - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - #TODO: add output dir. 
- client_command="genai-perf profile \ - -m $model \ - --service-kind openai \ - --backend "$backend" \ - --endpoint-type chat \ - --streaming \ - --url localhost:$port \ - --request-rate $qps \ - --num-prompts $num_prompts \ - " - - echo "Client command: $client_command" - - eval "$client_command" - - #TODO: process/record outputs - done - done - - kill_gpu_processes - -} - -prepare_dataset() { - - # download sharegpt dataset - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - -} - -main() { - - # check if the environment variable is successfully injected from yaml - - check_gpus - check_hf_token - get_current_llm_serving_engine - - pip install -U transformers - - pip install -r requirements/dev.txt - which genai-perf - - # check storage - df -h - - ensure_installed wget - ensure_installed curl - ensure_installed jq - # genai-perf dependency - ensure_installed libb64-0d - - prepare_dataset - - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - - # run the test - run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" - - # run genai-perf tests - run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" - mv artifacts/ $RESULTS_FOLDER/ - - # upload benchmark results to buildkite - python3 -m pip install tabulate pandas - python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py deleted file mode 100644 index fb3b9d5e34..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import datetime -import json -import os -from pathlib import Path - -import pandas as pd -from tabulate import tabulate - -results_folder = Path("results/") - -# serving results and the keys that will be printed into markdown -serving_results = [] -serving_column_mapping = { - "test_name": "Test name", - "gpu_type": "GPU", - "completed": "Successful req.", - "request_throughput": "Tput (req/s)", - "mean_ttft_ms": "Mean TTFT (ms)", - "std_ttft_ms": "Std TTFT (ms)", - "median_ttft_ms": "Median TTFT (ms)", - "mean_itl_ms": "Mean ITL (ms)", - "std_itl_ms": "Std ITL (ms)", - "median_itl_ms": "Median ITL (ms)", - "mean_tpot_ms": "Mean TPOT (ms)", - "std_tpot_ms": "Std TPOT (ms)", - "median_tpot_ms": "Median TPOT (ms)", - "total_token_throughput": "Total Token Tput (tok/s)", - "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", - "engine": "Engine", -} - -if __name__ == "__main__": - # collect results - for test_file in results_folder.glob("*.json"): - with open(test_file) as f: - raw_result = json.loads(f.read()) - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - 
raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - serving_results.append(raw_result) - continue - - serving_results = pd.DataFrame.from_dict(serving_results) - - if not serving_results.empty: - serving_results = serving_results[list(serving_column_mapping.keys())].rename( - columns=serving_column_mapping - ) - - serving_md_table_with_headers = tabulate( - serving_results, headers="keys", tablefmt="pipe", showindex=False - ) - # remove the first line of header - serving_md_table_lines = serving_md_table_with_headers.split("\n") - serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) - - prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") - - # document benchmarking results in markdown - with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: - # document results with header. - # for those who wants to reproduce our benchmark. - f.write(serving_md_table_with_headers) - f.write("\n") - - # document benchmarking results in json - with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: - results = serving_results.to_dict(orient="records") - f.write(json.dumps(results)) diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh deleted file mode 100644 index 50e1ab0242..0000000000 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) -if [[ "$BUILDKITE_BRANCH" == "main" ]]; then - URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" -else - URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" -fi - -TIMEOUT_SECONDS=10 - -retries=0 -while [ $retries -lt 1000 ]; do - if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then - exit 0 - fi - - echo "Waiting for image to be available..." - - retries=$((retries + 1)) - sleep 5 -done - -exit 1 diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md similarity index 69% rename from .buildkite/nightly-benchmarks/README.md rename to .buildkite/performance-benchmarks/README.md index e6f5c8b60f..332142ba5d 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -2,40 +2,23 @@ ## Introduction -This directory contains two sets of benchmark for vllm. - -- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance -- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. - -See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. +This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance. 
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD. ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models. **Benchmarking Duration**: about 1hr. **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. -## Nightly benchmark quick overview - -**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. - -**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. - -**Benchmarking Duration**: about 3.5hrs. - ## Trigger the benchmark -Performance benchmark will be triggered when: - -- A PR being merged into vllm. -- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. - -Manually Trigger the benchmark +The benchmark needs to be triggered manually: ```bash -bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: @@ -47,10 +30,6 @@ Runtime environment variables: - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. -Nightly benchmark will be triggered when: - -- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. - ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. @@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model, A comparison diagram will be generated below the table. Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 image - -## Nightly test details - -See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. - -### Workflow - -- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. -- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. -- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. -- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. - -### Nightly tests - -In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. 
- -### Docker containers - -The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. - -WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. - -WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md similarity index 100% rename from .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md rename to .buildkite/performance-benchmarks/performance-benchmarks-descriptions.md diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py similarity index 100% rename from .buildkite/nightly-benchmarks/scripts/compare-json-results.py rename to .buildkite/performance-benchmarks/scripts/compare-json-results.py diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py similarity index 99% rename from .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py rename to .buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py index a7544aeef4..80bb4d846a 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py @@ -392,7 +392,7 @@ if __name__ == "__main__": json_file = "benchmark_results.json" with open(results_folder / md_file, "w") as f: results = read_markdown( - "../.buildkite/nightly-benchmarks/" + "../.buildkite/performance-benchmarks/" + "performance-benchmarks-descriptions.md" ) results = results.format( diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/performance-benchmarks/scripts/launch-server.sh similarity index 100% rename from .buildkite/nightly-benchmarks/scripts/launch-server.sh rename to .buildkite/performance-benchmarks/scripts/launch-server.sh diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh similarity index 99% rename from .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh rename to .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index 5a47576483..9447ceffd7 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -469,7 +469,7 @@ main() { ensure_sharegpt_downloaded declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/ # dump vllm info via vllm collect-env env_output=$(vllm collect-env) diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/genai-perf-tests.json rename to .buildkite/performance-benchmarks/tests/genai-perf-tests.json diff --git 
a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/latency-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/latency-tests-cpu.json diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/performance-benchmarks/tests/latency-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/latency-tests.json rename to .buildkite/performance-benchmarks/tests/latency-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/performance-benchmarks/tests/nightly-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/nightly-tests.json rename to .buildkite/performance-benchmarks/tests/nightly-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests.json rename to .buildkite/performance-benchmarks/tests/serving-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/throughput-tests-cpu.json diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/performance-benchmarks/tests/throughput-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/throughput-tests.json rename to .buildkite/performance-benchmarks/tests/throughput-tests.json diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 250a64fdd0..27ed67c451 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -20,7 +20,10 @@ trap remove_docker_container EXIT # Run the image and test offline inference/tensor parallel docker run \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ + --net=host \ + --ipc=host \ + --privileged \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ -e "HF_TOKEN=${HF_TOKEN}" \ @@ -42,7 +45,7 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode 
--ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py ' diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 524d2e121a..35bd4c99ad 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -38,7 +38,7 @@ steps: - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist - # in /vllm/tools/generate_nightly_torch_test.py + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -286,7 +286,7 @@ steps: - label: Engine Test # 25min timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 #grade: Blocking source_file_dependencies: @@ -318,7 +318,7 @@ steps: - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: @@ -908,7 +908,7 @@ steps: - label: Quantized Models Test # 45 min timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 03268beecf..339e3aab6c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -38,7 +38,7 @@ steps: - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. 
Please check the error message and add the package to whitelist - # in /vllm/tools/generate_nightly_torch_test.py + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py soft_fail: true source_file_dependencies: - requirements/nightly_torch_test.txt @@ -205,6 +205,24 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - label: EPLB Algorithm Test # 5min timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" @@ -401,7 +419,7 @@ steps: --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss.py \ --ignore=lora/test_qwen3moe_tp.py - + parallelism: 4 - label: PyTorch Compilation Unit Tests # 15min @@ -498,6 +516,8 @@ steps: - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 @@ -1124,7 +1144,7 @@ steps: - tests/weight_loading commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - + - label: NixlConnector PD accuracy tests (Distributed) # 30min timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" @@ -1166,6 +1186,19 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + ##### H200 test ##### - label: Distributed Tests (H200) # optional gpu: h200 diff --git a/.github/mergify.yml b/.github/mergify.yml index de1a8314a4..18d4a2e831 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -108,7 +108,7 @@ pull_request_rules: - files~=^benchmarks/ - files~=^vllm/benchmarks/ - files~=^tests/benchmarks/ - - files~=^\.buildkite/nightly-benchmarks/ + - files~=^\.buildkite/performance-benchmarks/ actions: label: add: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fbfd8016cb..bcd40e7f8a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -45,7 +45,7 @@ repos: - id: format-torch-nightly-test name: reformat nightly_torch_test.txt to be in sync with test.in language: python - entry: python tools/generate_nightly_torch_test.py + entry: python tools/pre_commit/generate_nightly_torch_test.py files: ^requirements/test\.(in|txt)$ - id: mypy-local name: Run mypy locally for lowest supported Python version @@ -78,12 +78,12 @@ repos: stages: [manual] # Only run in 
CI - id: shellcheck name: Lint shell scripts - entry: tools/shellcheck.sh + entry: tools/pre_commit/shellcheck.sh language: script types: [shell] - id: png-lint name: Lint PNG exports from excalidraw - entry: tools/png-lint.sh + entry: tools/pre_commit/png-lint.sh language: script types: [png] - id: signoff-commit @@ -100,12 +100,12 @@ repos: stages: [commit-msg] - id: check-spdx-header name: Check SPDX headers - entry: python tools/check_spdx_header.py + entry: python tools/pre_commit/check_spdx_header.py language: python types: [python] - id: check-root-lazy-imports name: Check root lazy imports - entry: python tools/check_init_lazy_imports.py + entry: python tools/pre_commit/check_init_lazy_imports.py language: python types: [python] - id: check-filenames @@ -119,11 +119,11 @@ repos: pass_filenames: false - id: update-dockerfile-graph name: Update Dockerfile dependency graph - entry: tools/update-dockerfile-graph.sh + entry: tools/pre_commit/update-dockerfile-graph.sh language: script - id: enforce-import-regex-instead-of-re name: Enforce import regex as re - entry: python tools/enforce_regex_import.py + entry: python tools/pre_commit/enforce_regex_import.py language: python types: [python] pass_filenames: false @@ -131,7 +131,7 @@ repos: # forbid directly import triton - id: forbid-direct-triton-import name: "Forbid direct 'import triton'" - entry: python tools/check_triton_import.py + entry: python tools/pre_commit/check_triton_import.py language: python types: [python] pass_filenames: false @@ -144,7 +144,7 @@ repos: additional_dependencies: [regex] - id: validate-config name: Validate configuration has default values and that each field has a docstring - entry: python tools/validate_config.py + entry: python tools/pre_commit/validate_config.py language: python additional_dependencies: [regex] # Keep `suggestion` last diff --git a/docker/Dockerfile b/docker/Dockerfile index eb1453126e..42a830cb60 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -495,7 +495,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ else \ BITSANDBYTES_VERSION="0.46.1"; \ fi; \ - uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.14.0' + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docs/cli/.nav.yml b/docs/cli/.nav.yml index 6c2c09d566..d2d2905703 100644 --- a/docs/cli/.nav.yml +++ b/docs/cli/.nav.yml @@ -5,4 +5,4 @@ nav: - complete.md - run-batch.md - vllm bench: - - bench/*.md + - bench/**/*.md diff --git a/docs/cli/bench/sweep/plot.md b/docs/cli/bench/sweep/plot.md new file mode 100644 index 0000000000..f29bffb646 --- /dev/null +++ b/docs/cli/bench/sweep/plot.md @@ -0,0 +1,9 @@ +# vllm bench sweep plot + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_sweep_plot.md" diff --git a/docs/cli/bench/sweep/serve.md b/docs/cli/bench/sweep/serve.md new file mode 100644 index 0000000000..5b5f91a951 --- /dev/null +++ b/docs/cli/bench/sweep/serve.md @@ -0,0 +1,9 @@ +# vllm bench sweep serve + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_sweep_serve.md" diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md new file mode 100644 index 0000000000..5f8ab6005e --- /dev/null +++ b/docs/cli/bench/sweep/serve_sla.md 
@@ -0,0 +1,9 @@ +# vllm bench sweep serve_sla + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_sweep_serve_sla.md" diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index e8b58dbbc9..dca01eab5b 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -9,7 +9,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu - **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing - **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development -- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives [Benchmark CLI]: #benchmark-cli @@ -1061,7 +1060,7 @@ Follow these steps to run the script: Example command: ```bash -python -m vllm.benchmarks.sweep.serve \ +vllm bench sweep serve \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ @@ -1109,7 +1108,7 @@ For example, to ensure E2E latency within different target values for 99% of req Example command: ```bash -python -m vllm.benchmarks.sweep.serve_sla \ +vllm bench sweep serve_sla \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ @@ -1138,7 +1137,7 @@ The algorithm for adjusting the SLA variable is as follows: Example command: ```bash -python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ +vllm bench sweep plot benchmarks/results/ \ --var-x max_concurrency \ --row-by random_input_len \ --col-by random_output_len \ @@ -1167,7 +1166,7 @@ docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingf Then, run below command inside the docker instance. ```bash -bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh ``` When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json. @@ -1185,7 +1184,7 @@ For more results visualization, check the [visualizing the results](https://gith The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). -More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). +More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md). 
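As a consolidated illustration of the parameter-sweep commands earlier on this page, the sketch below writes a sweep file and runs a sweep end to end. Only the command names and flags shown above are taken from the documentation; the schema of `benchmarks/serve_hparams.json` (assumed here to be a JSON list of server-flag dictionaries) and the chosen `max_num_seqs` values are illustrative assumptions.

```bash
# Illustrative sweep setup; the JSON schema and values below are assumptions,
# not a documented format.
cat > benchmarks/serve_hparams.json << 'EOF'
[
  {"max_num_seqs": 32},
  {"max_num_seqs": 64}
]
EOF

vllm bench sweep serve \
  --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
  --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
  --serve-params benchmarks/serve_hparams.json

# Plot the collected results, faceted as in the example above.
vllm bench sweep plot benchmarks/results/ \
  --var-x max_concurrency
```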
### Continuous Benchmarking @@ -1210,11 +1209,3 @@ The benchmarking currently runs on a predefined set of models configured in the #### Viewing Results All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). - -## Nightly Benchmarks - -These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels. - -The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). - -More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md). diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 54031ec368..abffb7bc5f 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -49,11 +49,14 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa metadata: name: hf-token-secret type: Opaque - data: - token: $(HF_TOKEN) + stringData: + token: "REPLACE_WITH_TOKEN" EOF ``` +Here, the `token` field stores your **Hugging Face access token**. For details on how to generate a token, +see the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens). + Next, start the vLLM server as a Kubernetes Deployment and Service: ??? console "Config" diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index fb64a7bb9c..2f4b17f191 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py). -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples. +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). 
Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. ## Using an IO Processor plugin diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md index 9156df9db6..620a660a24 100644 --- a/docs/getting_started/installation/gpu.xpu.inc.md +++ b/docs/getting_started/installation/gpu.xpu.inc.md @@ -56,8 +56,10 @@ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . docker run -it \ --rm \ --network=host \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ + --ipc=host \ + --privileged \ vllm-xpu-env ``` diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 99d9a7bec3..ea89108f01 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -56,15 +56,20 @@ def auto_mock(module, attr, max_mocks=50): ) -latency = auto_mock("vllm.benchmarks", "latency") -serve = auto_mock("vllm.benchmarks", "serve") -throughput = auto_mock("vllm.benchmarks", "throughput") +bench_latency = auto_mock("vllm.benchmarks", "latency") +bench_serve = auto_mock("vllm.benchmarks", "serve") +bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") +bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs") +bench_sweep_serve_sla = auto_mock( + "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs" +) +bench_throughput = auto_mock("vllm.benchmarks", "throughput") AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs") EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs") ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand") CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand") -cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") -run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") +openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") +openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") FlexibleArgumentParser = auto_mock( "vllm.utils.argparse_utils", "FlexibleArgumentParser" ) @@ -114,6 +119,9 @@ class MarkdownFormatter(HelpFormatter): self._markdown_output.append(f"{action.help}\n\n") if (default := action.default) != SUPPRESS: + # Make empty string defaults visible + if default == "": + default = '""' self._markdown_output.append(f"Default: `{default}`\n\n") def format_help(self): @@ -150,17 +158,23 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Create parsers to document parsers = { + # Engine args "engine_args": create_parser(EngineArgs.add_cli_args), "async_engine_args": create_parser( AsyncEngineArgs.add_cli_args, async_args_only=True ), - "serve": create_parser(cli_args.make_arg_parser), + # CLI + "serve": create_parser(openai_cli_args.make_arg_parser), "chat": create_parser(ChatCommand.add_cli_args), "complete": create_parser(CompleteCommand.add_cli_args), - "bench_latency": create_parser(latency.add_cli_args), - "bench_throughput": create_parser(throughput.add_cli_args), - "bench_serve": create_parser(serve.add_cli_args), - "run-batch": create_parser(run_batch.make_arg_parser), + "run-batch": create_parser(openai_run_batch.make_arg_parser), + # 
Benchmark CLI + "bench_latency": create_parser(bench_latency.add_cli_args), + "bench_serve": create_parser(bench_serve.add_cli_args), + "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), + "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args), + "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args), + "bench_throughput": create_parser(bench_throughput.add_cli_args), } # Generate documentation for each parser
diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index c2cf107263..fc9d5eec38 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -45,6 +45,15 @@ vllm serve s3://core-llm/Llama-3-8b \ You can tune parameters using `--model-loader-extra-config`: +You can tune `distributed`, which controls whether distributed streaming is used. This is currently only supported on CUDA and ROCm devices. It can significantly improve loading times from object storage or high-throughput network fileshares. +You can read more about distributed streaming [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/usage.md#distributed-streaming). + +```bash +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer \ + --model-loader-extra-config '{"distributed":true}' +``` + You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. For reading from S3, it will be the number of client instances the host is opening to the S3 server.
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 40651be1d4..18bb645ea9 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -30,11 +30,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe vLLM will attempt to automatically convert the model according to the architecture names shown in the table below. -| Architecture | `--convert` | Supported pooling tasks | |-------------------------------------------------|-------------|-------------------------------| -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `encode`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `encode`, `classify`, `score` | -| `*ForRewardModeling`, `*RewardModel` | `reward` | `encode` | +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|---------------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | +| `*ForRewardModeling`, `*RewardModel` | `reward` | `token_classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model.
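The `--runner` and `--convert` flags referenced in the table and tip above can also be set explicitly on the command line. A minimal sketch (the model name is reused from the pooling examples elsewhere in this patch; for a `*Model` architecture this conversion would normally be inferred automatically):

```bash
# Explicitly select the pooling runner and the embed conversion instead of
# relying on automatic detection from the architecture name.
vllm serve intfloat/e5-small --runner pooling --convert embed
```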
@@ -45,12 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------|--------------------------------------| -| `encode` | `LLM.reward(...)` | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | -| `classify` | `LLM.classify(...)` | -| `score` | `LLM.score(...)` | +| Task | APIs | +|------------------|-------------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | +| `score` | `LLM.score(...)` | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. @@ -144,7 +146,6 @@ A code example can be found here: [examples/offline_inference/basic/score.py](.. ### `LLM.reward` The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. -It returns the extracted hidden states directly. ```python from vllm import LLM @@ -161,15 +162,17 @@ A code example can be found here: [examples/offline_inference/basic/reward.py](. ### `LLM.encode` The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. -It returns the extracted hidden states directly. !!! note Please use one of the more specific methods or set the task directly when using `LLM.encode`: - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`. - For similarity scores, use `LLM.score(...)`. + - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. + - For token classification, use `pooling_task="token_classify"`. + - For multi-vector retrieval, use `pooling_task="token_embed"`. + - For IO Processor Plugins, use `pooling_task="plugin"`. ```python from vllm import LLM @@ -185,10 +188,47 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: -- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models. - [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models. - [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models. +- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. + +!!! note + Please use one of the more specific APIs or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api): + + - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
+ - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`. + - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). + - For rewards, use `"task":"token_classify"`. + - For token classification, use `"task":"token_classify"`. + - For multi-vector retrieval, use `"task":"token_embed"`. + - For IO Processor Plugins, use `"task":"plugin"`. + +```python +# start a supported embeddings model server with `vllm serve`, e.g. +# vllm serve intfloat/e5-small +import requests + +host = "localhost" +port = "8000" +model_name = "intfloat/e5-small" + +api_url = f"http://{host}:{port}/pooling" + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +prompt = {"model": model_name, "input": prompts, "task": "embed"} + +response = requests.post(api_url, json=prompt) + +for output in response.json()["data"]: + data = output["data"] + print(f"Data: {data!r} (size={len(data)})") +``` ## Matryoshka Embeddings @@ -265,3 +305,16 @@ Expected output: ``` An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) + +## Deprecated Features + +### Encode task + +We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`: + +- `token_embed` is the same as `embed`, using `normalize` as the activation. +- `token_classify` is the same as `classify`, using `softmax` as the default activation. + +### Remove softmax from PoolingParams + +We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4d50c809d1..fd25647dce 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -382,6 +382,7 @@ th { | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | +| `KimiLinearForCausalLM` | Kimi-Linear-48B-A3B-Base, Kimi-Linear-48B-A3B-Instruct | `moonshotai/Kimi-Linear-48B-A3B-Base`, `moonshotai/Kimi-Linear-48B-A3B-Instruct` | | ✅︎ | | `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | | `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | @@ -402,6 +403,7 @@ th { | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | +| `OuroForCausalLM` | Ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc.
| ✅︎ | | | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 1414718a69..e331b3422e 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -638,7 +638,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). -Code example: [examples/online_serving/openai_cross_encoder_score.py](../../examples/online_serving/openai_cross_encoder_score.py) +Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py) #### Single inference @@ -819,7 +819,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including print("Scoring output:", response_json["data"][0]["score"]) print("Scoring output:", response_json["data"][1]["score"]) ``` -Full example: [examples/online_serving/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/openai_cross_encoder_score_for_multimodal.py) +Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py) #### Extra parameters diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index cd9717122b..ad78be3871 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -38,6 +38,18 @@ python examples/offline_inference/pooling/multi_vector_retrieval.py python examples/offline_inference/pooling/ner.py ``` +## Prithvi Geospatial MAE usage + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae.py +``` + +## IO Processor Plugins for Prithvi Geospatial MAE + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py +``` + ## Qwen3 reranker usage ```bash diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py index b2dffdd6c5..34c80e7ccf 100644 --- a/examples/offline_inference/pooling/ner.py +++ b/examples/offline_inference/pooling/ner.py @@ -33,7 +33,7 @@ def main(args: Namespace): label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label # Run inference - outputs = llm.encode(prompts) + outputs = llm.encode(prompts, pooling_task="token_classify") for prompt, output in zip(prompts, outputs): logits = output.outputs.data diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae.py diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae_io_processor.py rename to 
examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py index 295d163752..eb7ed969ea 100644 --- a/examples/offline_inference/torchrun_dp_example.py +++ b/examples/offline_inference/torchrun_dp_example.py @@ -9,10 +9,76 @@ To run this example: ```bash $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py ``` + +With custom parallelism settings: +```bash +$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \ + --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep +``` """ +import argparse + from vllm import LLM, SamplingParams + +def parse_args(): + parser = argparse.ArgumentParser( + description="Data-parallel inference with torchrun" + ) + parser.add_argument( + "--tp-size", + type=int, + default=1, + help="Tensor parallel size (default: 1)", + ) + parser.add_argument( + "--pp-size", + type=int, + default=1, + help="Pipeline parallel size (default: 1)", + ) + parser.add_argument( + "--dp-size", + type=int, + default=2, + help="Data parallel size (default: 2)", + ) + parser.add_argument( + "--enable-ep", + action="store_true", + help="Enable expert parallel (default: False)", + ) + parser.add_argument( + "--model", + type=str, + default="microsoft/Phi-mini-MoE-instruct", + help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)", + ) + parser.add_argument( + "--max-model-len", + type=int, + default=4096, + help="Maximum model length (default: 4096)", + ) + parser.add_argument( + "--gpu-memory-utilization", + type=float, + default=0.6, + help="GPU memory utilization (default: 0.6)", + ) + parser.add_argument( + "--seed", + type=int, + default=1, + help="Random seed (default: 1)", + ) + return parser.parse_args() + + +args = parse_args() + + # Create prompts, the same across all ranks prompts = [ "Hello, my name is", @@ -30,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # all ranks have the same random seed, so that sampling can be # deterministic across ranks. 
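# Assumes the external_launcher backend configured below: each torchrun process
# hosts exactly one worker, so --nproc-per-node should equal
# tp_size * pp_size * dp_size (e.g. 8 processes for --tp-size=2 --pp-size=1
# --dp-size=4).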
llm = LLM( - model="microsoft/Phi-mini-MoE-instruct", - tensor_parallel_size=1, - data_parallel_size=2, - pipeline_parallel_size=1, - enable_expert_parallel=False, + model=args.model, + tensor_parallel_size=args.tp_size, + data_parallel_size=args.dp_size, + pipeline_parallel_size=args.pp_size, + enable_expert_parallel=args.enable_ep, distributed_executor_backend="external_launcher", - max_model_len=4096, - gpu_memory_utilization=0.6, - seed=1, + max_model_len=args.max_model_len, + gpu_memory_utilization=args.gpu_memory_utilization, + seed=args.seed, ) dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 3b6da20d5f..b76ad21f04 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -3,65 +3,95 @@ ## Cohere rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/cohere_rerank_client.py ``` ## Embedding requests base64 encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_base64_client.py ``` ## Embedding requests bytes encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_bytes_client.py ``` ## Jinaai rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/jinaai_rerank_client.py ``` ## Multi vector retrieval usage ```bash +# vllm serve BAAI/bge-m3 python examples/online_serving/pooling/multi_vector_retrieval_client.py ``` ## Named Entity Recognition (NER) usage ```bash +# vllm serve boltuix/NeuroBERT-NER python examples/online_serving/pooling/ner_client.py ``` -## Openai chat embedding for multimodal usage +## OpenAI chat embedding for multimodal usage ```bash python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py ``` -## Openai classification usage +## OpenAI classification usage ```bash +# vllm serve jason9693/Qwen2.5-1.5B-apeach python examples/online_serving/pooling/openai_classification_client.py ``` -## Openai embedding usage +## OpenAI cross_encoder score usage ```bash +# vllm serve BAAI/bge-reranker-v2-m3 +python examples/online_serving/pooling/openai_cross_encoder_score.py +``` + +## OpenAI cross_encoder score for multimodal usage + +```bash +# vllm serve jinaai/jina-reranker-m0 +python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py +``` + +## OpenAI embedding usage + +```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/openai_embedding_client.py ``` -## Openai embedding matryoshka dimensions usage +## OpenAI embedding matryoshka dimensions usage ```bash +# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py ``` -## Openai pooling usage +## OpenAI pooling usage ```bash +# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code python examples/online_serving/pooling/openai_pooling_client.py ``` + +## Online Prithvi Geospatial MAE usage + +```bash +python examples/online_serving/pooling/prithvi_geospatial_mae.py +``` diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/pooling/openai_cross_encoder_score.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score.py rename to examples/online_serving/pooling/openai_cross_encoder_score.py diff --git 
a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py b/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score_for_multimodal.py rename to examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/online_serving/prithvi_geospatial_mae.py rename to examples/online_serving/pooling/prithvi_geospatial_mae.py diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 7c5bc457d4..dd45eb832a 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -13,5 +13,3 @@ torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytor # xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.4.1 -# Triton Kernels are needed for mxfp4 fused moe. (Should be updated alongside torch) -triton_kernels @ git+https://github.com/triton-lang/triton.git@v3.5.0#subdirectory=python/triton_kernels diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index dea1926bbd..63c1908f02 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -42,6 +42,6 @@ tritonclient==2.51.0 numba == 0.61.2 # Required for N-gram speculative decoding numpy -runai-model-streamer[s3,gcs]==0.14.0 +runai-model-streamer[s3,gcs]==0.15.0 fastsafetensors>=0.1.10 pydantic>=2.12 # 2.11 leads to error on python 3.13 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index d9743f0446..6f1cca90e5 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -12,6 +12,6 @@ tensorizer==2.10.1 packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -runai-model-streamer[s3,gcs]==0.14.0 +runai-model-streamer[s3,gcs]==0.15.0 conch-triton-kernels==1.2.1 timm>=1.0.17 diff --git a/requirements/test.in b/requirements/test.in index a79ec839db..b1ab599ff1 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -50,7 +50,7 @@ tritonclient==2.51.0 numba == 0.61.2 # Required for N-gram speculative decoding numpy -runai-model-streamer[s3,gcs]==0.14.0 +runai-model-streamer[s3,gcs]==0.15.0 fastsafetensors>=0.1.10 pydantic>=2.12 # 2.11 leads to error on python 3.13 decord==0.6.0 diff --git a/requirements/test.txt b/requirements/test.txt index bc007ccf10..e54bb49fde 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -965,11 +965,11 @@ rsa==4.9.1 # via google-auth rtree==1.4.0 # via torchgeo -runai-model-streamer==0.14.0 +runai-model-streamer==0.15.0 # via -r requirements/test.in -runai-model-streamer-gcs==0.14.0 +runai-model-streamer-gcs==0.15.0 # via runai-model-streamer -runai-model-streamer-s3==0.14.0 +runai-model-streamer-s3==0.15.0 # via runai-model-streamer s3transfer==0.10.3 # via boto3 diff --git a/requirements/xpu.txt b/requirements/xpu.txt index d14b631aa9..e69a98b860 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -15,4 +15,4 @@ torchaudio torchvision --extra-index-url=https://download.pytorch.org/whl/xpu -intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl +intel-extension-for-pytorch @ 
https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl diff --git a/setup.py b/setup.py index 990fe4cde3..8139d0d62b 100644 --- a/setup.py +++ b/setup.py @@ -709,10 +709,10 @@ setup( ext_modules=ext_modules, install_requires=get_requirements(), extras_require={ - "bench": ["pandas", "datasets"], + "bench": ["pandas", "matplotlib", "seaborn", "datasets"], "tensorizer": ["tensorizer==2.10.1"], "fastsafetensors": ["fastsafetensors >= 0.1.10"], - "runai": ["runai-model-streamer[s3,gcs] >= 0.14.0"], + "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"], "audio": [ "librosa", "soundfile", diff --git a/tests/benchmarks/test_random_dataset.py b/tests/benchmarks/test_random_dataset.py index 68e4afdcbe..57f6893061 100644 --- a/tests/benchmarks/test_random_dataset.py +++ b/tests/benchmarks/test_random_dataset.py @@ -359,3 +359,126 @@ def test_random_mm_bucket_config_not_mutated( assert len(mm_data) >= 1 for it in mm_data: assert it.get("type") == "image_url" + + +@pytest.mark.benchmark +def test_random_mm_video_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None: + """Test video sampling functionality in RandomMultiModalDataset.""" + ds = RandomMultiModalDataset(random_seed=42) + + # Test with video bucket configuration + bucket_config = { + (64, 64, 1): 0.3, # Images + (64, 64, 8): 0.7, # Videos + } + + limit_mm_per_prompt = {"image": 2, "video": 2} + + samples = _collect_mm_samples( + ds, + hf_tokenizer, + num_requests=5, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + ) + + assert len(samples) == 5 + + # Check that we have both images and videos + video_count = 0 + image_count = 0 + + for s in samples: + mm_data = cast(list[dict[str, Any]], s.multi_modal_data) + assert len(mm_data) == 1 + + item = mm_data[0] + if item.get("type") == "video_url": + video_count += 1 + # Verify video URL format + url = item.get("video_url", {}).get("url", "") + assert url.startswith("data:video/mp4;base64,") + elif item.get("type") == "image_url": + image_count += 1 + # Verify image URL format + url = item.get("image_url", {}).get("url", "") + assert url.startswith("data:image/jpeg;base64,") + + # Should have some videos due to 0.7 probability + assert video_count > 0 + assert image_count > 0 + + +@pytest.mark.benchmark +def test_random_mm_video_only_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None: + """Test sampling with only video buckets.""" + ds = RandomMultiModalDataset(random_seed=42) + + bucket_config = { + (64, 64, 8): 1.0, # Only videos + } + + limit_mm_per_prompt = {"image": 0, "video": 1} + + samples = _collect_mm_samples( + ds, + hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + ) + + assert len(samples) == 3 + + for s in samples: + mm_data = cast(list[dict[str, Any]], s.multi_modal_data) + assert len(mm_data) == 1 + + item = mm_data[0] + assert item.get("type") == "video_url" + url = item.get("video_url", {}).get("url", "") + assert url.startswith("data:video/mp4;base64,") + + +@pytest.mark.benchmark +def test_random_mm_video_deterministic_sampling( + hf_tokenizer: PreTrainedTokenizerBase, +) -> None: + """Test that video sampling is deterministic with same seed.""" + seed = 123 + ds_a = RandomMultiModalDataset(random_seed=seed) + ds_b = RandomMultiModalDataset(random_seed=seed) + + 
bucket_config = { + (64, 64, 8): 1.0, # Only videos + } + + limit_mm_per_prompt = {"image": 0, "video": 1} + + a = _collect_mm_samples( + ds_a, + hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + ) + + b = _collect_mm_samples( + ds_b, + hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + ) + + fa = [_mm_fingerprint_sample(s) for s in a] + fb = [_mm_fingerprint_sample(s) for s in b] + assert fa == fb diff --git a/tests/benchmarks/test_random_multimodal_dataset_video.py b/tests/benchmarks/test_random_multimodal_dataset_video.py new file mode 100644 index 0000000000..db19a169e3 --- /dev/null +++ b/tests/benchmarks/test_random_multimodal_dataset_video.py @@ -0,0 +1,398 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import base64 +import os +from tempfile import NamedTemporaryFile +from typing import Any, cast + +import cv2 +import pytest +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.benchmarks.datasets import RandomMultiModalDataset, SampleRequest + + +@pytest.fixture(scope="session") +def hf_tokenizer() -> PreTrainedTokenizerBase: + """Use a small, commonly available tokenizer.""" + return AutoTokenizer.from_pretrained("gpt2") + + +@pytest.fixture +def video_dataset() -> RandomMultiModalDataset: + """Create a RandomMultiModalDataset instance for testing.""" + return RandomMultiModalDataset(random_seed=42) + + +@pytest.mark.benchmark +def test_generate_synthetic_video_different_seeds(): + """Test that different seeds produce different videos.""" + dataset1 = RandomMultiModalDataset(random_seed=123) + dataset2 = RandomMultiModalDataset(random_seed=456) + + width, height, num_frames = 64, 48, 8 + + video1 = dataset1.generate_synthetic_video(width, height, num_frames) + video2 = dataset2.generate_synthetic_video(width, height, num_frames) + + # Videos should be different due to different seeds + assert video1["bytes"] != video2["bytes"] + + +@pytest.mark.benchmark +def test_map_config_to_modality(video_dataset: RandomMultiModalDataset): + """Test modality mapping for different configurations.""" + # Test image configuration (num_frames = 1) + assert video_dataset.map_config_to_modality((256, 256, 1)) == "image" + assert video_dataset.map_config_to_modality((720, 1280, 1)) == "image" + + # Test video configurations (num_frames > 1) + assert video_dataset.map_config_to_modality((256, 256, 8)) == "video" + assert video_dataset.map_config_to_modality((720, 1280, 16)) == "video" + assert video_dataset.map_config_to_modality((64, 64, 32)) == "video" + + # Test invalid configurations + with pytest.raises(ValueError, match="Invalid multimodal item configuration"): + video_dataset.map_config_to_modality((256, 256, 0)) + + with pytest.raises(ValueError, match="Invalid multimodal item configuration"): + video_dataset.map_config_to_modality((256, 256, -1)) + + +@pytest.mark.benchmark +def test_generate_mm_item_video(video_dataset: RandomMultiModalDataset): + """Test generating multimodal items for video configurations.""" + # Test video item generation + video_config = (64, 48, 8) # height, width, num_frames + result = video_dataset.generate_mm_item(video_config) + + # Check the result structure matches OpenAI API format + assert isinstance(result, dict) + assert result["type"] == 
"video_url" + assert "video_url" in result + assert "url" in result["video_url"] + + # Check that the URL is a data URL with base64 encoded video + url = result["video_url"]["url"] + assert url.startswith("data:video/mp4;base64,") + + # Decode and verify the video content + base64_data = url.split(",")[1] + video_bytes = base64.b64decode(base64_data) + assert len(video_bytes) > 0 + + # Verify the video can be decoded + with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file: + temp_path = temp_file.name + temp_file.write(video_bytes) + + try: + cap = cv2.VideoCapture(temp_path) + assert cap.isOpened() + + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + assert frame_count == 8 + assert frame_width == 48 + assert frame_height == 64 + + cap.release() + finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + +@pytest.mark.benchmark +def test_generate_mm_item_image(video_dataset: RandomMultiModalDataset): + """Test generating multimodal items for image configurations.""" + # Test image item generation + image_config = (64, 48, 1) # height, width, num_frames=1 + result = video_dataset.generate_mm_item(image_config) + + # Check the result structure matches OpenAI API format + assert isinstance(result, dict) + assert result["type"] == "image_url" + assert "image_url" in result + assert "url" in result["image_url"] + + # Check that the URL is a data URL with base64 encoded image + url = result["image_url"]["url"] + assert url.startswith("data:image/jpeg;base64,") + + +@pytest.mark.benchmark +def test_generate_mm_item_invalid_config(video_dataset: RandomMultiModalDataset): + """Test error handling for invalid configurations.""" + with pytest.raises(ValueError, match="Invalid multimodal item configuration"): + video_dataset.generate_mm_item((256, 256, 0)) + + +@pytest.mark.benchmark +def test_sample_with_video_buckets( + video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase +): + """Test sampling with video bucket configurations.""" + # Configure bucket with video probability > 0 + bucket_config = { + (64, 64, 1): 0.3, # Images + (64, 64, 8): 0.7, # Videos + } + + limit_mm_per_prompt = {"image": 5, "video": 3} + + samples = video_dataset.sample( + tokenizer=hf_tokenizer, + num_requests=5, + base_items_per_request=2, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + assert len(samples) == 5 + + # Check that samples contain both images and videos + video_count = 0 + image_count = 0 + + for sample in samples: + assert isinstance(sample, SampleRequest) + assert sample.multi_modal_data is not None + assert isinstance(sample.multi_modal_data, list) + + mm_data = cast(list[dict[str, Any]], sample.multi_modal_data) + assert len(mm_data) == 2 # base_items_per_request + + for item in mm_data: + if item["type"] == "video_url": + video_count += 1 + # Verify video URL format + url = item["video_url"]["url"] + assert url.startswith("data:video/mp4;base64,") + elif item["type"] == "image_url": + image_count += 1 + # Verify image URL format + url = item["image_url"]["url"] + assert url.startswith("data:image/jpeg;base64,") + + # Should have some videos due to 0.7 probability + assert video_count > 0 + assert image_count > 0 + + +@pytest.mark.benchmark +def test_sample_video_only_buckets( + video_dataset: RandomMultiModalDataset, hf_tokenizer: 
PreTrainedTokenizerBase +): + """Test sampling with only video buckets.""" + bucket_config = { + (64, 64, 8): 1.0, # Only videos + } + + limit_mm_per_prompt = {"image": 0, "video": 2} + + samples = video_dataset.sample( + tokenizer=hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + assert len(samples) == 3 + + for sample in samples: + assert isinstance(sample, SampleRequest) + assert sample.multi_modal_data is not None + assert isinstance(sample.multi_modal_data, list) + + mm_data = cast(list[dict[str, Any]], sample.multi_modal_data) + assert len(mm_data) == 1 + + item = mm_data[0] + assert item["type"] == "video_url" + url = item["video_url"]["url"] + assert url.startswith("data:video/mp4;base64,") + + +@pytest.mark.benchmark +def test_sample_respects_video_limits( + video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase +): + """Test that sampling respects video limits per prompt.""" + bucket_config = { + (64, 64, 8): 1.0, # Only videos + } + + # Set very low video limit + limit_mm_per_prompt = {"image": 0, "video": 1} + + samples = video_dataset.sample( + tokenizer=hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + assert len(samples) == 3 + + for sample in samples: + mm_data = cast(list[dict[str, Any]], sample.multi_modal_data) + assert len(mm_data) <= 1 # Should respect video limit + + +@pytest.mark.benchmark +def test_sample_mixed_buckets_with_zero_probability( + video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase +): + """Test sampling with mixed buckets including zero probability entries.""" + bucket_config = { + (64, 64, 1): 0.5, # Images + (64, 64, 8): 0.5, # Videos + (128, 128, 16): 0.0, # Zero probability videos (should be ignored) + } + + limit_mm_per_prompt = {"image": 2, "video": 2} + + samples = video_dataset.sample( + tokenizer=hf_tokenizer, + num_requests=4, + base_items_per_request=2, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + assert len(samples) == 4 + + # Should only see 64x64 videos, not 128x128 videos + for sample in samples: + mm_data = cast(list[dict[str, Any]], sample.multi_modal_data) + for item in mm_data: + if item["type"] == "video_url": + # Decode video to verify dimensions + url = item["video_url"]["url"] + base64_data = url.split(",")[1] + video_bytes = base64.b64decode(base64_data) + + with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file: # noqa + temp_path = temp_file.name + temp_file.write(video_bytes) + + try: + cap = cv2.VideoCapture(temp_path) + frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + cap.release() + + # Should be 64x64, not 128x128 + assert frame_width == 64 + assert frame_height == 64 + finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + +@pytest.mark.benchmark +def test_sample_deterministic_with_videos(hf_tokenizer: PreTrainedTokenizerBase): + """Test that sampling with videos is deterministic with same seed.""" + dataset1 = RandomMultiModalDataset(random_seed=123) + dataset2 = RandomMultiModalDataset(random_seed=123) + + bucket_config = { + (64, 64, 1): 0.3, # Images + (64, 64, 8): 0.7, # Videos 
+ } + + limit_mm_per_prompt = {"image": 2, "video": 2} + + samples1 = dataset1.sample( + tokenizer=hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + samples2 = dataset2.sample( + tokenizer=hf_tokenizer, + num_requests=3, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + assert len(samples1) == len(samples2) + + # Compare multimodal data + for s1, s2 in zip(samples1, samples2): + assert s1.multi_modal_data == s2.multi_modal_data + + +@pytest.mark.benchmark +def test_sample_different_seeds_produce_different_videos( + hf_tokenizer: PreTrainedTokenizerBase, +): + """Test that different seeds produce different video content.""" + dataset1 = RandomMultiModalDataset(random_seed=123) + dataset2 = RandomMultiModalDataset(random_seed=456) + + bucket_config = { + (64, 64, 8): 1.0, # Only videos + } + + limit_mm_per_prompt = {"image": 0, "video": 1} + + samples1 = dataset1.sample( + tokenizer=hf_tokenizer, + num_requests=2, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + samples2 = dataset2.sample( + tokenizer=hf_tokenizer, + num_requests=2, + base_items_per_request=1, + num_mm_items_range_ratio=0.0, + limit_mm_per_prompt=limit_mm_per_prompt, + bucket_config=bucket_config, + input_len=20, + output_len=5, + ) + + # Video content should be different + for s1, s2 in zip(samples1, samples2): + mm_data1 = cast(list[dict[str, Any]], s1.multi_modal_data) + mm_data2 = cast(list[dict[str, Any]], s2.multi_modal_data) + + assert len(mm_data1) == len(mm_data2) == 1 + + url1 = mm_data1[0]["video_url"]["url"] + url2 = mm_data2[0]["video_url"]["url"] + + assert url1 != url2 # Different video content diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/test_multimodal_compile.py new file mode 100644 index 0000000000..6c195dd93f --- /dev/null +++ b/tests/compile/test_multimodal_compile.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from vllm.compilation.counter import compilation_counter +from vllm.config.compilation import CompilationMode + + +# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 +@pytest.mark.forked +def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): + """Test that Qwen2.5-VL vision submodules are compiled. + + This test verifies that the 3 vision submodules (Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionBlock, and Qwen2_5_VisionPatchMerger) are properly tagged + for compilation by checking that num_models_seen increases by at least 3. 
+ """ + # Disable multiprocessing so that the counter is in the same process + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + + with ( + # NOTE: Qwen2.5-VL has 35 models in total - the LLM backend + # Vision Patch Embed, Vision Patch Merger, and then 32 Vision Blocks + # (one for each layer) - in the future, we should fix vLLM compilation + # logic to handle this case and only compile the Vision submodules once + # and reuse the compiled code for all layers + # See https://github.com/vllm-project/vllm/issues/27590 + compilation_counter.expect(num_models_seen=35), + vllm_runner( + "Qwen/Qwen2.5-VL-3B-Instruct", + max_model_len=2048, + gpu_memory_utilization=0.7, + compilation_config={"mode": CompilationMode.VLLM_COMPILE}, + ) as _, + ): + pass diff --git a/tests/conftest.py b/tests/conftest.py index ec0179b9cd..91155a72b1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -831,8 +831,9 @@ class VllmRunner: images: PromptImageInput | None = None, videos: PromptVideoInput | None = None, audios: PromptAudioInput | None = None, + return_logprobs: bool = False, **kwargs: Any, - ) -> list[tuple[list[list[int]], list[str]]]: + ) -> list[tuple[list[list[int]], list[str]]] | tuple[list, list]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) req_outputs = self.llm.generate( @@ -840,18 +841,23 @@ class VllmRunner: ) outputs: list[tuple[list[list[int]], list[str]]] = [] + logprobs = [] for req_output in req_outputs: prompt_str = req_output.prompt prompt_ids = req_output.prompt_token_ids req_sample_output_ids: list[list[int]] = [] req_sample_output_strs: list[str] = [] + req_logprobs = [] for sample in req_output.outputs: output_str = sample.text output_ids = list(sample.token_ids) req_sample_output_ids.append(prompt_ids + output_ids) req_sample_output_strs.append((prompt_str or "") + output_str) + if sample.logprobs: + req_logprobs.extend(sample.logprobs) outputs.append((req_sample_output_ids, req_sample_output_strs)) - return outputs + logprobs.append(req_logprobs) + return outputs if not return_logprobs else (outputs, logprobs) @staticmethod def _final_steps_generate_w_logprobs( diff --git a/tests/entrypoints/openai/test_response_api_mcp_tools.py b/tests/entrypoints/openai/test_response_api_mcp_tools.py index 653d44f20b..0dc2430cae 100644 --- a/tests/entrypoints/openai/test_response_api_mcp_tools.py +++ b/tests/entrypoints/openai/test_response_api_mcp_tools.py @@ -26,6 +26,8 @@ def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch): with monkeypatch_module.context() as m: m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") + # Helps the model follow instructions better + m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1") with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -37,7 +39,9 @@ def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch): with monkeypatch_module.context() as m: m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") - m.setenv("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container") + m.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container") + # Helps the model follow instructions better + m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1") with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -56,18 +60,15 @@ async def mcp_enabled_client(mcp_enabled_server): @pytest.mark.asyncio 
@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: str): response = await mcp_enabled_client.responses.create( model=model_name, - # TODO: Ideally should be able to set max tool calls - # to prevent multi-turn, but it is not currently supported - # would speed up the test input=( - "What's the first 4 digits after the decimal point of " - "cube root of `19910212 * 20250910`? " - "Show only the digits. The python interpreter is not stateful " - "and you must print to see the output." + "Execute the following code: " + "import random; print(random.randint(1, 1000000))" + ), + instructions=( + "You must use the Python tool to execute code. Never simulate execution." ), tools=[ { @@ -77,26 +78,47 @@ async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: "server_url": "http://localhost:8888", } ], + extra_body={"enable_response_messages": True}, ) assert response is not None assert response.status == "completed" - assert response.usage.output_tokens_details.tool_output_tokens > 0 + # Verify output messages: Tool calls and responses on analysis channel + tool_call_found = False + tool_response_found = False + for message in response.output_messages: + recipient = message.get("recipient") + if recipient and recipient.startswith("python"): + tool_call_found = True + assert message.get("channel") == "analysis", ( + "Tool call should be on analysis channel" + ) + author = message.get("author", {}) + if ( + author.get("role") == "tool" + and author.get("name") + and author.get("name").startswith("python") + ): + tool_response_found = True + assert message.get("channel") == "analysis", ( + "Tool response should be on analysis channel" + ) + + assert tool_call_found, "Should have found at least one Python tool call" + assert tool_response_found, "Should have found at least one Python tool response" + for message in response.input_messages: + assert message.get("author").get("role") != "developer", ( + "No developer messages should be present with valid mcp tool" + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_name: str): response = await mcp_disabled_client.responses.create( model=model_name, - # TODO: Ideally should be able to set max tool calls - # to prevent multi-turn, but it is not currently supported - # would speed up the test input=( - "What's the first 4 digits after the decimal point of " - "cube root of `19910212 * 20250910`? " - "Show only the digits. The python interpreter is not stateful " - "and you must print to see the output." 
+ "Execute the following code if the tool is present: " + "import random; print(random.randint(1, 1000000))" ), tools=[ { @@ -106,7 +128,34 @@ async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_nam "server_url": "http://localhost:8888", } ], + extra_body={"enable_response_messages": True}, ) assert response is not None assert response.status == "completed" - assert response.usage.output_tokens_details.tool_output_tokens == 0 + # Verify output messages: No tool calls and responses + tool_call_found = False + tool_response_found = False + for message in response.output_messages: + recipient = message.get("recipient") + if recipient and recipient.startswith("python"): + tool_call_found = True + assert message.get("channel") == "analysis", ( + "Tool call should be on analysis channel" + ) + author = message.get("author", {}) + if ( + author.get("role") == "tool" + and author.get("name") + and author.get("name").startswith("python") + ): + tool_response_found = True + assert message.get("channel") == "analysis", ( + "Tool response should be on analysis channel" + ) + + assert not tool_call_found, "Should not have a python call" + assert not tool_response_found, "Should not have a tool response" + for message in response.input_messages: + assert message.get("author").get("role") != "developer", ( + "No developer messages should be present without a valid tool" + ) diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py index cf21a5116d..788a1e9121 100644 --- a/tests/entrypoints/openai/test_serving_responses.py +++ b/tests/entrypoints/openai/test_serving_responses.py @@ -6,10 +6,19 @@ from unittest.mock import MagicMock import pytest import pytest_asyncio +from openai.types.responses.tool import ( + CodeInterpreterContainerCodeInterpreterToolAuto, + LocalShell, + Mcp, + Tool, +) from vllm.entrypoints.context import ConversationContext from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest -from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses +from vllm.entrypoints.openai.serving_responses import ( + OpenAIServingResponses, + extract_tool_types, +) from vllm.entrypoints.tool_server import ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt @@ -62,6 +71,45 @@ def mock_exit_stack(): return MagicMock(spec=AsyncExitStack) +def test_extract_tool_types(monkeypatch: pytest.MonkeyPatch) -> None: + tools: list[Tool] = [] + assert extract_tool_types(tools) == set() + + tools.append(LocalShell(type="local_shell")) + assert extract_tool_types(tools) == {"local_shell"} + + tools.append(CodeInterpreterContainerCodeInterpreterToolAuto(type="auto")) + assert extract_tool_types(tools) == {"local_shell", "auto"} + + tools.extend( + [ + Mcp(type="mcp", server_label="random", server_url=""), + Mcp(type="mcp", server_label="container", server_url=""), + Mcp(type="mcp", server_label="code_interpreter", server_url=""), + Mcp(type="mcp", server_label="web_search_preview", server_url=""), + ] + ) + # When envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS is not set, + # mcp tool types are all ignored. 
+ assert extract_tool_types(tools) == {"local_shell", "auto"} + + # container is allowed, it would be extracted + monkeypatch.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "container") + assert extract_tool_types(tools) == {"local_shell", "auto", "container"} + + # code_interpreter and web_search_preview are allowed, + # they would be extracted + monkeypatch.setenv( + "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,web_search_preview" + ) + assert extract_tool_types(tools) == { + "local_shell", + "auto", + "code_interpreter", + "web_search_preview", + } + + class TestInitializeToolSessions: """Test class for _initialize_tool_sessions method""" diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index e07436f89d..5f94ac6da2 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests +from prometheus_client.parser import text_string_to_metric_families from ...utils import RemoteOpenAIServer @@ -31,12 +32,28 @@ def test_sleep_mode(): assert response.status_code == 200 assert response.json().get("is_sleeping") is True + # check sleep metrics + response = requests.get(remote_server.url_for("metrics")) + assert response.status_code == 200 + awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response) + assert awake == 0 + assert weights_offloaded == 1 + assert discard_all == 0 + response = requests.post(remote_server.url_for("wake_up")) assert response.status_code == 200 response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 assert response.json().get("is_sleeping") is False + # check sleep metrics + response = requests.get(remote_server.url_for("metrics")) + assert response.status_code == 200 + awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response) + assert awake == 1 + assert weights_offloaded == 0 + assert discard_all == 0 + # test wake up with tags response = requests.post(remote_server.url_for("sleep"), params={"level": "1"}) assert response.status_code == 200 @@ -59,3 +76,35 @@ def test_sleep_mode(): response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 assert response.json().get("is_sleeping") is False + + # check sleep metrics + response = requests.get(remote_server.url_for("metrics")) + assert response.status_code == 200 + awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response) + assert awake == 1 + assert weights_offloaded == 0 + assert discard_all == 0 + + +def _get_sleep_metrics_from_api(response: requests.Response): + """Return (awake, weights_offloaded, discard_all)""" + + awake, weights_offloaded, discard_all = None, None, None + + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:engine_sleep_state": + for sample in family.samples: + if sample.name == "vllm:engine_sleep_state": + for label_name, label_value in sample.labels.items(): + if label_value == "awake": + awake = sample.value + elif label_value == "weights_offloaded": + weights_offloaded = sample.value + elif label_value == "discard_all": + discard_all = sample.value + + assert awake is not None + assert weights_offloaded is not None + assert discard_all is not None + + return awake, weights_offloaded, discard_all diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 96f634ee0a..1063c3b6b7 100644 
--- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -37,15 +37,17 @@ def llm(): @pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.classify( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.tensor([x.outputs.probs for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 81058dbad8..0255704cec 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -37,15 +37,17 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py index 2df973dd78..b69c6a47c1 100644 --- a/tests/entrypoints/pooling/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -34,21 +34,21 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." outputs = llm.score( text_1, text_2, - pooling_params=PoolingParams(activation=activation), + pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False, ) return torch.tensor([x.outputs.score for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
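The three LLM-level pooling tests above all exercise the same rename: the `activation` knob on `PoolingParams` is now `use_activation`, and the HTTP-level tests that follow accept the same field in their request JSON. A minimal offline sketch of the new spelling, assuming a vLLM build that already includes this rename and reusing the small classifier from these tests:

    from vllm import LLM, PoolingParams

    # Same classifier the classification tests use; any sequence-classification model works.
    llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", max_model_len=512)
    prompts = ["This product was excellent and exceeded my expectations"]

    # None keeps the model default, True applies the head's activation (probabilities),
    # False returns the raw scores instead.
    w_activation = llm.classify(
        prompts, pooling_params=PoolingParams(use_activation=True), use_tqdm=False
    )
    wo_activation = llm.classify(
        prompts, pooling_params=PoolingParams(use_activation=False), use_tqdm=False
    )
    print(w_activation[0].outputs.probs, wo_activation[0].outputs.probs)

The OpenAI-compatible classify, score, and rerank endpoints in the diffs below take the same `use_activation` boolean in their JSON bodies.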
diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 92d40efad2..671bb94878 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import ClassificationResponse +from vllm.entrypoints.openai.protocol import ClassificationResponse, PoolingResponse MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue @@ -163,20 +163,24 @@ async def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): input_text = ["This product was excellent and exceeded my expectations"] - async def get_outputs(activation): + async def get_outputs(use_activation): response = requests.post( server.url_for("classify"), - json={"model": model_name, "input": input_text, "activation": activation}, + json={ + "model": model_name, + "input": input_text, + "use_activation": use_activation, + }, ) outputs = response.json() return torch.tensor([x["probs"] for x in outputs["data"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." @@ -191,18 +195,7 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_pooling(server: RemoteOpenAIServer, model_name: str): - # pooling api uses ALL pooling, which does not support chunked prefill. - response = requests.post( - server.url_for("pooling"), - json={"model": model_name, "input": "test", "encoding_format": "float"}, - ) - assert response.json()["error"]["type"] == "BadRequestError" - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_score(server: RemoteOpenAIServer, model_name: str): +async def test_score(server: RemoteOpenAIServer, model_name: str): # score api is only enabled for num_labels == 1. response = requests.post( server.url_for("score"), @@ -217,7 +210,7 @@ def test_score(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_rerank(server: RemoteOpenAIServer, model_name: str): +async def test_rerank(server: RemoteOpenAIServer, model_name: str): # rerank api is only enabled for num_labels == 1. 
response = requests.post( server.url_for("rerank"), @@ -228,3 +221,62 @@ def test_rerank(server: RemoteOpenAIServer, model_name: str): }, ) assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 2 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): + # token_classify uses ALL pooling, which does not support chunked prefill. + task = "token_classify" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index b3f12283fd..e971b23e8f 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -562,12 +562,40 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str): + task = "embed" input_text = ["The chef prepared a delicious meal."] response = requests.post( server.url_for("pooling"), - json={"model": model_name, "input": input_text, "encoding_format": "float"}, + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str): + task = "token_embed" + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, ) poolings = PoolingResponse.model_validate(response.json()) @@ -575,3 +603,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) 
== 11 assert len(poolings.data[0].data[0]) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index e43148d25f..1d85190c12 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -125,8 +125,8 @@ def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): - async def get_outputs(activation): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): + async def get_outputs(use_activation): query = "What is the capital of France?" documents = [ "The capital of Brazil is Brasilia.", @@ -139,16 +139,16 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): "model": model_name, "query": query, "documents": documents, - "activation": activation, + "use_activation": use_activation, }, ) outputs = response.json() return torch.tensor([x["relevance_score"] for x in outputs["results"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
@@ -163,7 +163,25 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): input_text = ["The chef prepared a delicious meal."] response = requests.post( @@ -176,3 +194,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py index ef213ab0ea..b8f796d47e 100644 --- a/tests/entrypoints/pooling/openai/test_score.py +++ b/tests/entrypoints/pooling/openai/test_score.py @@ -218,8 +218,8 @@ class TestModel: # TODO: reset this tolerance to 0.01 once we find # an alternative to flash_attn with bfloat16 - def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): - def get_outputs(activation): + def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): + def get_outputs(use_activation): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." response = requests.post( @@ -228,7 +228,7 @@ class TestModel: "model": model["name"], "text_1": text_1, "text_2": text_2, - "activation": activation, + "use_activation": use_activation, }, ) if response.status_code != 200: @@ -238,9 +238,9 @@ class TestModel: return torch.tensor([x["score"] for x in outputs["data"]]) if model["is_cross_encoder"]: - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." @@ -252,8 +252,8 @@ class TestModel: "w_activation should be close to activation(wo_activation)." 
) else: - get_outputs(activation=None) + get_outputs(use_activation=None) # The activation parameter only works for the is_cross_encoder model - response = get_outputs(activation=True) + response = get_outputs(use_activation=True) assert response.status_code == 400 diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/test_harmony_utils.py index 8d1764d411..6fa051a678 100644 --- a/tests/entrypoints/test_harmony_utils.py +++ b/tests/entrypoints/test_harmony_utils.py @@ -3,7 +3,10 @@ from openai_harmony import Role -from vllm.entrypoints.harmony_utils import parse_input_to_harmony_message +from vllm.entrypoints.harmony_utils import ( + has_custom_tools, + parse_input_to_harmony_message, +) class TestParseInputToHarmonyMessage: @@ -252,3 +255,12 @@ class TestParseInputToHarmonyMessage: assert len(messages[0].content) == 2 assert messages[0].content[0].text == "" assert messages[0].content[1].text == "actual text" + + +def test_has_custom_tools() -> None: + assert not has_custom_tools(set()) + assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"}) + assert has_custom_tools({"others"}) + assert has_custom_tools( + {"web_search_preview", "code_interpreter", "container", "others"} + ) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index c517e5c026..1d925dc1be 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -138,6 +138,7 @@ class Config: } backend = self.all2all_backend() + vllm_config.parallel_config.all2all_backend = backend if backend is not None: env_dict.update({"VLLM_ALL2ALL_BACKEND": backend}) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 2dce099770..62704bbcbb 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -24,23 +24,16 @@ from vllm.triton_utils import tl MNK_FACTORS = [ (1, 128, 128), - (1, 128, 2048), (1, 512, 512), - (1, 1024, 128), (1, 1024, 2048), (32, 128, 128), (32, 512, 512), (32, 1024, 2048), - (45, 128, 128), (45, 128, 2048), - (45, 512, 512), (45, 1024, 128), - (45, 1024, 2048), (64, 512, 512), (64, 1024, 2048), - (222, 128, 128), (222, 128, 2048), - (222, 1024, 128), (222, 1024, 2048), ] NUM_EXPERTS = [8, 64] @@ -117,10 +110,19 @@ def test_batched_mm( block_shape: list[int] | None, per_act_token_quant: bool, ): + """Note: float8_e4m3fn is not supported on CUDA architecture < 89, + and those tests will be skipped on unsupported hardware.""" current_platform.seed_everything(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn + if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability( + 89 + ): + pytest.skip( + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) + if (per_act_token_quant or block_shape is not None) and not use_fp8_w8a8: pytest.skip("Don't test blocking for non-quantized types.") @@ -244,10 +246,19 @@ def test_fused_moe_batched_experts( block_shape: list[int] | None, input_scales: bool, ): + """Note: float8_e4m3fn is not supported on CUDA architecture < 89, + and those tests will be skipped on unsupported hardware.""" current_platform.seed_everything(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn + if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability( + 89 + ): + pytest.skip( + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) + if topk > e: pytest.skip("topk > e") diff --git 
a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 60f9f14b7f..cd34617ee0 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -42,57 +42,43 @@ DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] # and its hidden size is 7168. MNK_FACTORS = [ (1, 128, 128), - (1, 512, 512), (1, 128, 7168), (1, 1024, 7168), (1, 4608, 128), - (1, 4608, 512), (1, 4608, 7168), (83, 128, 128), (83, 512, 512), - (83, 1024, 7168), (83, 4608, 512), (83, 4608, 7168), - (128, 128, 128), (128, 512, 512), (128, 1024, 7168), - (128, 4608, 512), (128, 4608, 7168), (2048, 128, 128), (2048, 1024, 7168), (2048, 4608, 512), (2048, 4608, 7168), (8192, 128, 128), - (8192, 512, 512), (8192, 128, 7168), (8192, 1024, 7168), - (8192, 4608, 512), (8192, 4608, 7168), ] MNK_FACTORS_DG = [ (128, 128, 128), - (128, 512, 512), (128, 128, 7168), (128, 1024, 7168), (128, 4608, 128), - (128, 4608, 512), (128, 4608, 7168), - (192, 128, 128), (192, 512, 512), (192, 1024, 7168), - (192, 4608, 512), (192, 4608, 7168), (1335, 128, 128), (1335, 1024, 7168), (1335, 4608, 512), (1335, 4608, 7168), (2048, 128, 128), - (2048, 512, 512), (2048, 128, 7168), (2048, 1024, 7168), - (2048, 4608, 128), - (2048, 4608, 512), (2048, 4608, 7168), ] diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 74cc943714..3799e60f12 100644 --- a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -21,36 +21,28 @@ vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 vllm_config.scheduler_config.max_model_len = 8192 -DTYPES = [torch.half, torch.bfloat16] +DTYPES = [torch.bfloat16] MNK_FACTORS = [ (1, 128, 128), - (1, 512, 512), (1, 128, 7168), (1, 1024, 7168), - (1, 4096, 128), (1, 4096, 512), (1, 4096, 7168), - (33, 128, 128), (33, 512, 512), (33, 128, 7168), (33, 1024, 7168), (33, 4096, 128), - (33, 4096, 512), (33, 4096, 7168), (128, 128, 128), - (128, 512, 512), (128, 1024, 7168), (128, 4096, 512), (128, 4096, 7168), - (222, 128, 128), (222, 512, 512), (222, 1024, 7168), - (222, 4096, 512), (222, 4096, 7168), (2048, 128, 128), (2048, 1024, 7168), - (2048, 4096, 512), (2048, 4096, 4096), ] diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 4330eda251..5512ccce47 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -26,16 +26,13 @@ TOP_KS = [6, 8] MNK_FACTORS = [ (2, 1024, 1024), - (2, 1024, 1536), (2, 3072, 1024), (2, 3072, 1536), (7, 3072, 1536), (64, 1024, 1024), (64, 1024, 1536), (64, 3072, 1024), - (64, 3072, 1536), (224, 1024, 1024), - (224, 1024, 1536), (224, 3072, 1024), (224, 3072, 1536), (32768, 1024, 1024), diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index d46f453488..9d039b8169 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -393,7 +393,6 @@ def _test_deepep_deepgemm_moe( MNKs = [ (8, 128, 128), (8, 128, 512), - (8, 512, 512), (3, 1024, 2048), (32, 128, 1024), (45, 512, 2048), diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index cad0085d5b..9b1054f7d0 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -130,10 +130,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size): # Note: N <= 512 will disable the deepgemm path due to performance issues. 
MNKs = [ (1024, 768, 128), - (1024, 768, 512), (2048, 768, 512), (512, 1024, 1024), - (512, 2048, 2048), (4096, 4096, 1024), ] diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 0780232a82..f985f9ac7c 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -34,8 +34,6 @@ TOP_KS = [1] MNK_FACTORS = [ (256, 8192, 5120), - (256, 4096, 5120), - (127, 8192, 5120), (127, 4096, 5120), (10, 8192, 5120), (10, 4096, 5120), diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index 18cfd4f790..be3e36865d 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -34,10 +34,8 @@ if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_cap MNK_FACTORS = [ (2, 1024, 1024), - (2, 1024, 1536), (2, 3072, 1024), (2, 3072, 1536), - (64, 1024, 1024), (64, 1024, 1536), (64, 3072, 1024), (64, 2048, 1536), @@ -49,7 +47,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", [40, 64, 256]) @pytest.mark.parametrize("topk", [1, 6, 8]) -@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @torch.inference_mode() def test_flashinfer_fp4_moe_no_graph( m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py index 3f4f142be7..662e0723b7 100644 --- a/tests/kernels/moe/test_grouped_topk.py +++ b/tests/kernels/moe/test_grouped_topk.py @@ -27,7 +27,7 @@ from vllm.platforms import current_platform @pytest.mark.parametrize("topk_group", [2]) @pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"]) @pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32]) def test_grouped_topk( monkeypatch: pytest.MonkeyPatch, n_token: int, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index a46b0053e7..e3b8621b45 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -295,6 +295,8 @@ def test_modular_kernel_combinations_singlegpu( world_size: int, pytestconfig, ): + """Note: float8_e4m3fn is not supported on CUDA architecture < 89, + and those tests will be skipped on unsupported hardware.""" config = Config( Ms=Ms, K=k, @@ -309,6 +311,12 @@ def test_modular_kernel_combinations_singlegpu( world_size=world_size, ) + if ( + quant_config is not None and quant_config.quant_dtype == torch.float8_e4m3fn + ) and not current_platform.has_device_capability(89): + pytest.skip( + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) verbosity = pytestconfig.getoption("verbose") run(config, verbosity > 0) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 2c802ff4e6..014df1fa11 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -66,8 +66,6 @@ FUSED_MOE_MNK_FACTORS = [ (1, 128, 128), (1, 2048, 128), (33, 2048, 128), - (222, 1024, 1024), - (32768, 128, 128), (32768, 2048, 511), (40000, 1024, 1024), ] @@ -76,7 +74,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [ (1, 128, 128), (1, 1024, 1024), (32, 2048, 128), - (32, 1024, 1024), (222, 2048, 1024), ] @@ -512,8 +509,8 @@ def 
marlin_moe_generate_valid_test_cases(): e_list = [4, 12] topk_list = [2, 3] ep_size_list = [1, 4] - dtype_list = [torch.half, torch.bfloat16] - group_size_list = [-1, 16, 32, 128] + dtype_list = [torch.bfloat16] + group_size_list = [-1, 32, 128] act_order_list = [True, False] quant_type_list = [ scalar_types.float4_e2m1f, @@ -885,10 +882,10 @@ def test_batched_moe_align_block_size_opcheck(): ) -@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("m", [1, 33, 222]) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): input = torch.randn((m, topk, k), device="cuda", dtype=dtype) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index dae19c0b2b..aa544fe0e0 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -26,9 +26,7 @@ MNK_FACTORS = [ (2, 1024, 1024), (2, 1024, 1536), (2, 3072, 1024), - (2, 3072, 1536), (64, 1024, 1024), - (64, 1024, 1536), (64, 3072, 1024), (64, 2048, 1536), (224, 1024, 1024), @@ -39,7 +37,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", [40, 64, 256]) @pytest.mark.parametrize("topk", [1, 6, 8]) -@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @torch.inference_mode() def test_cutlass_fp4_moe_no_graph( m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 92e78ec239..97a55c37b9 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -19,20 +19,16 @@ CASES = [ (32, 64, 256, fp8_dtype), (17, 31, 768, fp8_dtype), (1, 1, 128 * 1, fp8_dtype), - (1, 1, 128 * 2, fp8_dtype), (1, 1, 128 * 3, fp8_dtype), (1, 1, 128 * 4, fp8_dtype), (8, 16, 128 * 1, fp8_dtype), (8, 16, 128 * 2, fp8_dtype), (8, 16, 128 * 3, fp8_dtype), - (8, 16, 128 * 4, fp8_dtype), (8, 64, 7168, fp8_dtype), (8, 128, 7168, fp8_dtype), - (8, 256, 7168, fp8_dtype), (8, 512, 7168, fp8_dtype), (8, 1024, 7168, fp8_dtype), (256, 8, 7168, fp8_dtype), - (256, 16, 7168, fp8_dtype), (256, 32, 7168, fp8_dtype), (256, 64, 7168, fp8_dtype), # Only add a few fnuz tests to help with long CI times. diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py index 1a5b9ba364..dfc45e78e4 100644 --- a/tests/lora/test_default_mm_loras.py +++ b/tests/lora/test_default_mm_loras.py @@ -30,7 +30,8 @@ VLLM_RUNNER_BASE_KWARGS = { "enable_lora": "True", "max_num_seqs": 2, "max_lora_rank": 320, - "max_model_len": 12800, + # Keep these LoRA tests on short-RoPE for determinism post-LongRoPE change. 
+ "max_model_len": 4096, "gpu_memory_utilization": 0.8, "limit_mm_per_prompt": {"audio": 1}, "enforce_eager": True, diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 55663ee3f1..deb5de984d 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -24,7 +24,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_activation_out = vllm_model.classify(example_prompts) @@ -32,7 +32,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_activation_out = vllm_model.classify(example_prompts) @@ -104,7 +104,7 @@ def test_reward_models_using_activation( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(activation=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_activation = vllm_model.reward(example_prompts) @@ -112,7 +112,7 @@ def test_reward_models_using_activation( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(activation=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_activation = vllm_model.reward(example_prompts) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index f11f75418e..4c79ac318f 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -159,6 +159,28 @@ VLM_TEST_SETTINGS = { image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + "qwen3_vl": VLMTestInfo( + models=["Qwen/Qwen3-VL-4B-Instruct"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO, + ), + needs_video_metadata=True, + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + num_logprobs=20, + auto_cls=AutoModelForImageTextToText, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[ + pytest.mark.core_model, + ], + ), "ultravox": VLMTestInfo( models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"], test_type=VLMTestType.AUDIO, diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 6252f33bdf..47852453c0 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -4,7 +4,9 @@ from collections.abc import Callable, Iterable from pathlib import PosixPath +from typing import Any +import numpy.typing as npt import torch from vllm.multimodal.audio import AudioResampler @@ -236,6 +238,7 @@ def build_video_inputs_from_test_info( video_assets: VideoTestAssets, size_wrapper: ImageSizeWrapper, num_frames: int, + needs_video_metadata: 
bool, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build video inputs") @@ -248,7 +251,10 @@ def build_video_inputs_from_test_info( ) sampled_vids = [ - sample_frames_from_video(asset.np_ndarrays, num_frames) + sample_frames_with_video_metadata( + (asset.np_ndarrays, asset.metadata), + num_frames, + ) for asset in video_assets ] @@ -259,12 +265,33 @@ def build_video_inputs_from_test_info( return [ PromptWithMultiModalInput( prompts=[prompt for _ in size_wrapper.data], - video_data=[video_scaler(video, size) for size in size_wrapper.data], + video_data=[ + ( + video_scaler(video, size) + if not needs_video_metadata + else (video_scaler(video, size), meta) + ) + for size in size_wrapper.data + ], ) - for video, prompt in zip(sampled_vids, model_prompts) + for (video, meta), prompt in zip(sampled_vids, model_prompts) ] +def sample_frames_with_video_metadata( + video_with_meta: tuple[npt.NDArray, dict[str, Any]], + num_frames: int, +) -> tuple[npt.NDArray, dict[str, Any]]: + video, meta = video_with_meta + video = sample_frames_from_video(video, num_frames) + + meta["do_sample_frames"] = meta["total_num_frames"] == num_frames + meta["total_num_frames"] = num_frames + meta["fps"] = meta["duration"] / num_frames + meta["frames_indices"] = list(range(num_frames)) + return video, meta + + def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType): """Applies a size scaler to one image; this can be an image size factor, which scales the image while maintaining the aspect ratio""" diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index 77e478e53c..d42150bcbf 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py +++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -100,6 +100,9 @@ def get_parametrized_options( # num_frames is video only if test_type == VLMTestType.VIDEO: iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames) + iter_kwargs["needs_video_metadata"] = ensure_wrapped( + test_info.needs_video_metadata + ) # No sizes passed for custom inputs, since inputs are directly provided if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO): diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 0685a01da5..87cd5c3cd3 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -905,6 +905,54 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: return hf_model +def qwen3_vl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Qwen3-VL.""" + hf_processor = hf_model.processor + + def processor(*args, videos=None, **kwargs): + if videos is not None and is_list_of(videos, tuple): + # batched multi videos + do_sample_frames = {video[1]["do_sample_frames"] for video in videos} + assert len(do_sample_frames) == 1 + if kwargs.get("do_sample_frames") is None: + kwargs["do_sample_frames"] = do_sample_frames + video_metadata = [ + [ + VideoMetadata( + **{k: v for k, v in video[1].items() if k != "do_sample_frames"} + ) + ] + for video in videos + ] + videos = [[video[0]] for video in videos] + elif videos is not None and isinstance(videos, tuple): + # single video + do_sample_frames = 
videos[1]["do_sample_frames"] + if kwargs.get("do_sample_frames") is None: + kwargs["do_sample_frames"] = do_sample_frames + video_metadata = [ + [ + VideoMetadata( + **{ + k: v + for k, v in videos[1].items() + if k != "do_sample_frames" + } + ) + ] + ] + videos = [[videos[0]]] + else: + video_metadata = None + + return hf_processor( + *args, videos=videos, video_metadata=video_metadata, **kwargs + ) + + hf_model.processor = processor + return hf_model + + def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner: from vllm.model_executor.models.tarsier import get_vision_encoder_info diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py index c91ae117b5..218339ef1d 100644 --- a/tests/models/multimodal/generation/vlm_utils/runners.py +++ b/tests/models/multimodal/generation/vlm_utils/runners.py @@ -117,6 +117,7 @@ def run_video_test( video_assets, test_case.size_wrapper, test_case.num_video_frames, + test_case.needs_video_metadata, ) core.run_test( diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index fe02f71884..5c1bc6ac28 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -154,7 +154,8 @@ class VLMTestInfo(NamedTuple): dtype: str = "auto" distributed_executor_backend: str | None = None # Only expanded in video tests - num_video_frames: int = 16 + num_video_frames: int | tuple[int] = 16 + needs_video_metadata: bool = False # Fixed image sizes / image size factors; most tests use image_size_factors # The values provided for these two fields will be stacked and expanded @@ -212,5 +213,6 @@ class ExpandableVLMTestArgs(NamedTuple): size_wrapper: ImageSizeWrapper | None = None # Video only num_video_frames: int | None = None + needs_video_metadata: bool = False # Custom inputs only custom_test_opts: CustomTestOptions | None = None diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index 5e0421af1c..2422097853 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -9,10 +9,16 @@ import pytest from transformers import BitsAndBytesConfig from tests.quantization.utils import is_quant_method_supported +from vllm.platforms import current_platform from ...utils import compare_two_settings, multi_gpu_test from ..utils import check_embeddings_close, check_logprobs_close +pytestmark = pytest.mark.skipif( + current_platform.is_rocm(), + reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)", +) + models_4bit_to_test = [ ("facebook/opt-125m", "quantize opt model inflight"), ( diff --git a/tests/models/registry.py b/tests/models/registry.py index 17b1d7b527..7b5977ec58 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -296,6 +296,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "random": "ai21labs/Jamba-tiny-random", }, ), + "KimiLinearForCausalLM": _HfExamplesInfo( + "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True + ), "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B"), "Lfm2MoeForCausalLM": _HfExamplesInfo( "LiquidAI/LFM2-8B-A1B", min_transformers_version="4.58" @@ -366,6 +369,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "OrionForCausalLM": _HfExamplesInfo( "OrionStarAI/Orion-14B-Chat", trust_remote_code=True ), + "OuroForCausalLM": _HfExamplesInfo("ByteDance/Ouro-1.4B", trust_remote_code=True), 
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7f863a169d..bb3572752d 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,7 +49,18 @@ def test_model_load_and_run( KV_CACHE_MODELS = [ # AutoFP8 format using separate .k_scale and .v_scale - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", + # The original checkpoint below was removed from the Hub. To unblock CI and + # until a small replacement with split K/V scales is found, skip this case. + # See PR #27717 for context. + pytest.param( + "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", + marks=pytest.mark.skip( + reason=( + "Checkpoint removed from HF; temporarily disabling this " + "AutoFP8 split K/V case (PR #27717)." + ) + ), + ), ] diff --git a/tests/standalone_tests/pytorch_nightly_dependency.sh b/tests/standalone_tests/pytorch_nightly_dependency.sh index cb531e13ec..fd93ad76be 100644 --- a/tests/standalone_tests/pytorch_nightly_dependency.sh +++ b/tests/standalone_tests/pytorch_nightly_dependency.sh @@ -37,6 +37,6 @@ if diff before.txt after.txt; then else echo "torch version overridden by nightly_torch_test.txt, \ if the dependency is not triggered by the pytroch nightly test,\ - please add the dependency to the list 'white_list' in tools/generate_nightly_torch_test.py" + please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py" exit 1 fi diff --git a/tests/test_envs.py b/tests/test_envs.py index 023767505f..841d7945f9 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -10,6 +10,7 @@ import vllm.envs as envs from vllm.envs import ( enable_envs_cache, env_list_with_choices, + env_set_with_choices, env_with_choices, environment_variables, ) @@ -257,3 +258,110 @@ class TestEnvListWithChoices: with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}): env_func = env_list_with_choices("TEST_ENV", [], ["option1", "option2"]) assert env_func() == ["option1", "option1", "option2"] + + +class TestEnvSetWithChoices: + """Test cases for env_set_with_choices function.""" + + def test_default_list_returned_when_env_not_set(self): + """Test that default list is returned when env var is not set.""" + env_func = env_set_with_choices( + "NONEXISTENT_ENV", ["default1", "default2"], ["option1", "option2"] + ) + assert env_func() == {"default1", "default2"} + + def test_empty_default_list_returned_when_env_not_set(self): + """Test that empty default list is returned when env not set.""" + env_func = env_set_with_choices("NONEXISTENT_ENV", [], ["option1", "option2"]) + assert env_func() == set() + + def test_single_valid_value_parsed_correctly(self): + """Test that single valid value is parsed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"]) + assert env_func() == {"option1"} + + def test_multiple_valid_values_parsed_correctly(self): + """Test that multiple valid values are parsed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,option2"}): + env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"]) + assert env_func() == {"option1", "option2"} + + def test_values_with_whitespace_trimmed(self): + """Test that values with whitespace are trimmed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": " option1 , option2 
"}): + env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"]) + assert env_func() == {"option1", "option2"} + + def test_empty_values_filtered_out(self): + """Test that empty values are filtered out.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,,option2,"}): + env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"]) + assert env_func() == {"option1", "option2"} + + def test_empty_string_returns_default(self): + """Test that empty string returns default.""" + with patch.dict(os.environ, {"TEST_ENV": ""}): + env_func = env_set_with_choices( + "TEST_ENV", ["default"], ["option1", "option2"] + ) + assert env_func() == {"default"} + + def test_only_commas_returns_default(self): + """Test that string with only commas returns default.""" + with patch.dict(os.environ, {"TEST_ENV": ",,,"}): + env_func = env_set_with_choices( + "TEST_ENV", ["default"], ["option1", "option2"] + ) + assert env_func() == {"default"} + + def test_case_sensitive_validation(self): + """Test case sensitive validation.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,OPTION2"}): + env_func = env_set_with_choices( + "TEST_ENV", [], ["option1", "option2"], case_sensitive=True + ) + with pytest.raises(ValueError, match="Invalid value 'OPTION2' in TEST_ENV"): + env_func() + + def test_case_insensitive_validation(self): + """Test case insensitive validation.""" + with patch.dict(os.environ, {"TEST_ENV": "OPTION1,option2"}): + env_func = env_set_with_choices( + "TEST_ENV", [], ["option1", "option2"], case_sensitive=False + ) + assert env_func() == {"OPTION1", "option2"} + + def test_invalid_value_in_list_raises_error(self): + """Test that invalid value in list raises ValueError.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,invalid,option2"}): + env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"]) + with pytest.raises(ValueError, match="Invalid value 'invalid' in TEST_ENV"): + env_func() + + def test_callable_choices_resolved_correctly(self): + """Test that callable choices are resolved correctly.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1,dynamic2"}): + env_func = env_set_with_choices("TEST_ENV", [], get_choices) + assert env_func() == {"dynamic1", "dynamic2"} + + def test_callable_choices_with_invalid_value(self): + """Test that callable choices raise error for invalid values.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1,invalid"}): + env_func = env_set_with_choices("TEST_ENV", [], get_choices) + with pytest.raises(ValueError, match="Invalid value 'invalid' in TEST_ENV"): + env_func() + + def test_duplicate_values_deduped(self): + """Test that duplicate values in the list are deduped.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}): + env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"]) + assert env_func() == {"option1", "option2"} diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e73d7efc14..7812562c89 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -17,7 +17,7 @@ EMBEDDING_MODELS = [ ), ] -classify_parameters = ["activation"] +classify_parameters = ["use_activation"] embed_parameters = ["dimensions", "normalize"] step_pooling_parameters = ["step_tag_id", "returned_token_ids"] @@ -88,13 +88,13 @@ def test_embed_dimensions(model_info: EmbedModelInfo): def test_classify(task): model_config = 
MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters + step_pooling_parameters @@ -137,13 +137,13 @@ def test_token_classify(pooling_type: str): pooler_config=PoolerConfig(pooling_type=pooling_type) ) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters diff --git a/tests/tools/test_config_validator.py b/tests/tools/test_config_validator.py index 22d838d272..d6104dc6d2 100644 --- a/tests/tools/test_config_validator.py +++ b/tests/tools/test_config_validator.py @@ -5,7 +5,7 @@ import ast import pytest -from tools.validate_config import validate_ast +from tools.pre_commit.validate_config import validate_ast _TestConfig1 = """ @config diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py index b271409b92..e372194542 100644 --- a/tests/v1/attention/test_batch_reordering.py +++ b/tests/v1/attention/test_batch_reordering.py @@ -53,7 +53,7 @@ REORDER_TEST_CASES = { expected_modified=True, ), "already_ordered": ReorderTestCase( - requests=[(1, 10), (1, 20), (100, 100), (200, 200)], + requests=[(1, 10), (1, 20), (100, 100), (200, 0)], expected_order=[0, 1, 2, 3], expected_modified=False, ), @@ -74,15 +74,30 @@ REORDER_TEST_CASES = { expected_modified=True, ), "decode_extend_prefill": ReorderTestCase( - requests=[(100, 100), (10, 50), (1, 10)], + requests=[(100, 0), (10, 50), (1, 10)], expected_order=[2, 1, 0], expected_modified=True, ), "extend_prefill_only": ReorderTestCase( - requests=[(100, 100), (10, 50), (200, 200), (20, 75)], + requests=[(100, 0), (10, 50), (200, 0), (20, 75)], expected_order=[3, 1, 2, 0], # Only swap 0↔3, keep 1 and 2 in place expected_modified=True, ), + "complicated_mixed_interleaved": ReorderTestCase( + requests=[ + (1, 20), + (1, 50), + (374, 0), + (300, 20), + (1, 20), + (256, 0), + (1, 5), + (27, 0), + (1, 4), + ], + expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5], + expected_modified=True, + ), } diff --git a/tests/v1/e2e/test_async_sched_and_preempt.py b/tests/v1/e2e/test_async_sched_and_preempt.py index 7ad9606a66..15a1cc2558 100644 --- a/tests/v1/e2e/test_async_sched_and_preempt.py +++ b/tests/v1/e2e/test_async_sched_and_preempt.py @@ -6,6 +6,7 @@ import pytest import torch._dynamo.config as dynamo_config from vllm import SamplingParams +from vllm.logprobs import Logprob from ...conftest import VllmRunner from ...models.utils import check_outputs_equal @@ -32,6 +33,8 @@ def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch): # dict(min_tokens=20), dict(presence_penalty=-1.0), dict(bad_words=["the", " 
the"]), + dict(logprobs=2), + dict(logprobs=2, presence_penalty=-1.0), ] default_params = dict( @@ -77,29 +80,33 @@ def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch): sampling_params=SamplingParams( **default_params, **override_params ), + return_logprobs=True, ) ) if not outputs: # First check that the different parameter configs # actually result in different output. - for other_test, params in zip( + for (other_test_outs, other_test_logprobs), params in zip( results[1:], sampling_param_tests[1:] ): with pytest.raises(AssertionError): check_outputs_equal( - outputs_0_lst=results[0], - outputs_1_lst=other_test, + outputs_0_lst=results[0][0], + outputs_1_lst=other_test_outs, name_0=f"baseline params={params}", name_1=f"other params={params}", ) + assert _all_logprobs_match( + results[0][1], other_test_logprobs + ) outputs.append((test_config, results)) baseline_config, baseline_tests = outputs[0] for test_config, test_outputs in outputs[1:]: - for base_outs, test_outs, params in zip( + for (base_outs, base_logprobs), (test_outs, test_logprobs), params in zip( baseline_tests, test_outputs, sampling_param_tests ): check_outputs_equal( @@ -108,5 +115,27 @@ def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch): name_0=f"baseline=[{baseline_config}], params={params}", name_1=f"config=[{test_config}], params={params}", ) + assert _all_logprobs_match(base_logprobs, test_logprobs) print(f"PASSED: config=[{test_config}], params={params}") + + +def _all_logprobs_match(req_a, req_b) -> bool: + return ( + req_a == req_b + or len(req_a) == len(req_b) + and all( + len(seq_a) == len(seq_b) + and all(_logprobs_match(a, b) for a, b in zip(seq_a, seq_b)) + for seq_a, seq_b in zip(req_a, req_b) + ) + ) + + +def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool: + return len(lps_a) == len(lps_b) and all( + a.decoded_token == b.decoded_token + and a.rank == b.rank + and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6) + for a, b in ((lps_a[x], lps_b[x]) for x in lps_a) + ) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 7dbdf0ca07..45b48e5858 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -121,6 +121,86 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() +@pytest.mark.parametrize( + "model_path", + [ + "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3", + "RedHatAI/Qwen3-8B-speculator.eagle3", + ], + ids=["llama3_eagle3_speculator", "qwen3_eagle3_speculator"], +) +def test_speculators_model_integration( + monkeypatch: pytest.MonkeyPatch, + sampling_config: SamplingParams, + model_path: str, +): + """ + Test that speculators models work with the simplified integration. + + This verifies the `vllm serve ` use case where + speculative config is automatically detected from the model config + without requiring explicit --speculative-config argument. + + Tests: + 1. Speculator model is correctly detected + 2. Verifier model is extracted from speculator config + 3. Speculative decoding is automatically enabled + 4. Text generation works correctly + 5. 
Output matches reference (non-speculative) generation + """ + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Generate test prompts + test_prompts = get_test_prompts(mm_enabled=False) + + # First run: Direct speculator model (simplified integration) + spec_llm = LLM(model=model_path, max_model_len=1024) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + + # Verify speculative config was auto-detected + assert spec_llm.llm_engine.vllm_config.speculative_config is not None, ( + f"Speculative config should be auto-detected for {model_path}" + ) + + spec_config = spec_llm.llm_engine.vllm_config.speculative_config + assert spec_config.num_speculative_tokens > 0, ( + f"Expected positive speculative tokens, " + f"got {spec_config.num_speculative_tokens}" + ) + + # Verify draft model is set to the speculator model + assert spec_config.model == model_path, ( + f"Draft model should be {model_path}, got {spec_config.model}" + ) + + # Extract verifier model for reference run + verifier_model = spec_llm.llm_engine.vllm_config.model_config.model + + del spec_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + # Second run: Reference without speculative decoding + ref_llm = LLM(model=verifier_model, max_model_len=1024) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + # Compare outputs + matches = sum( + 1 + for ref, spec in zip(ref_outputs, spec_outputs) + if ref.outputs[0].text == spec.outputs[0].text + ) + + # Heuristic: expect at least 66% of prompts to match exactly + assert matches >= int(0.66 * len(ref_outputs)), ( + f"Only {matches}/{len(ref_outputs)} outputs matched. " + f"Expected at least {int(0.66 * len(ref_outputs))} matches." + ) + + @pytest.mark.parametrize( ["model_setup", "mm_enabled"], [ diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index ad7594a3dd..032ed42f43 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -6,7 +6,7 @@ import pytest_asyncio from tests.utils import RemoteOpenAIServer # Use a small reasoning model to test the responses API. 
-MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B" @pytest.fixture(scope="module") diff --git a/tests/v1/generation/test_batch_invariance.py b/tests/v1/generation/test_batch_invariance.py index 8e59b695ed..f05fac2478 100644 --- a/tests/v1/generation/test_batch_invariance.py +++ b/tests/v1/generation/test_batch_invariance.py @@ -17,16 +17,10 @@ skip_unsupported = pytest.mark.skipif( @pytest.fixture(autouse=True) -def enable_batch_invariant_mode(): +def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): """Automatically enable batch invariant kernel overrides for all tests.""" - old_value = os.environ.get("VLLM_BATCH_INVARIANT") - os.environ["VLLM_BATCH_INVARIANT"] = "1" + monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") yield - # Restore original value after test - if old_value is None: - os.environ.pop("VLLM_BATCH_INVARIANT", None) - else: - os.environ["VLLM_BATCH_INVARIANT"] = old_value def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: @@ -76,7 +70,13 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: @skip_unsupported @pytest.mark.timeout(1000) -def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(): +@pytest.mark.parametrize( + "backend", + ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], +) +def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( + backend, monkeypatch: pytest.MonkeyPatch +): """ Ensures that the same request (the 'needle' prompt) yields identical output whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), @@ -101,6 +101,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(): seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # Allow overrides from environment (useful for CI tuning) # "facebook/opt-125m" is too small, doesn't reliably test determinism model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") @@ -220,11 +221,15 @@ def _extract_step_logprobs(request_output): @skip_unsupported -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER"]) +@pytest.mark.parametrize( + "backend", + ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], +) @pytest.mark.forked -def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend): - backend = os.getenv("VLLM_ATTENTION_BACKEND", backend) - os.environ["VLLM_ATTENTION_BACKEND"] = backend +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( + backend, monkeypatch: pytest.MonkeyPatch +): + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) @@ -435,11 +440,16 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(backend): @skip_unsupported -def test_simple_generation(): +@pytest.mark.parametrize( + "backend", + ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], +) +def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): """ Simple test that runs the model with a basic prompt and prints the output. Useful for quick smoke testing and debugging. 
""" + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") llm = LLM( @@ -481,9 +491,14 @@ def test_simple_generation(): @skip_unsupported -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER"]) +@pytest.mark.parametrize( + "backend", + ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], +) @pytest.mark.forked -def test_logprobs_WITHOUT_batch_invariance_should_FAIL(backend): +def test_logprobs_without_batch_invariance_should_fail( + backend, monkeypatch: pytest.MonkeyPatch +): """ This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. It DISABLES batch invariance mode and expects to see non-deterministic behavior @@ -493,224 +508,214 @@ def test_logprobs_WITHOUT_batch_invariance_should_FAIL(backend): The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). """ - backend = os.getenv("VLLM_ATTENTION_BACKEND", backend) - os.environ["VLLM_ATTENTION_BACKEND"] = backend + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test - old_value = os.environ.get("VLLM_BATCH_INVARIANT") - os.environ["VLLM_BATCH_INVARIANT"] = "0" + monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") - try: - seed = int(os.getenv("VLLM_TEST_SEED", "12345")) - random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") - tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) + seed = int(os.getenv("VLLM_TEST_SEED", "12345")) + random.seed(seed) + model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) - print(f"\n{'=' * 80}") - print("BATCH INVARIANCE DISABLED: Expecting non-deterministic behavior") + print(f"\n{'=' * 80}") + print("BATCH INVARIANCE DISABLED: Expecting non-deterministic behavior") + print(f"{'=' * 80}\n") + + llm = LLM( + model=model_name, + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + max_num_seqs=32, + max_model_len=8192, + dtype="bfloat16", + ) + + # build ragged prompts to change shapes significantly across BS=1 vs BS=N + long_min = int(os.getenv("VLLM_MIN_PROMPT", "768")) + long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048")) + prompts: list[str] = [] + options = [ + (max(long_min, 1536), max(long_max, 3072)), # very long + (max(1024, long_min), max(2048, long_max)), # long + (256, 512), # mid + (10, 20), # short + ] + + for _ in range(32): + lo, hi = random.choice(options) + prompts.append(_random_prompt(lo, hi)) + + sp = SamplingParams( + temperature=0.6, + top_p=1.0, + max_tokens=8, + seed=1234, + logprobs=5, + ) + + # BS=1: run prompts individually and collect logprobs per step. + print("\n" + "=" * 80) + print("STARTING BS=1 RUNS (each prompt individually)") + print("=" * 80 + "\n") + + bs1_logprobs_per_prompt = [] + bs1_tokens_per_prompt = [] + for idx, p in enumerate(prompts): + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") + outs = llm.generate([p], sp, use_tqdm=False) + assert len(outs) == 1 + step_logprobs, token_ids = _extract_step_logprobs(outs[0]) + if step_logprobs is None: + pytest.skip( + "Logits are not available on RequestOutput; " + "enable logprobs return to run this test." 
+ ) + bs1_logprobs_per_prompt.append(step_logprobs) + bs1_tokens_per_prompt.append(token_ids) + print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") + + # BS=N: run prompts in a batch and collect logprobs per step for each prompt. + print("\n" + "=" * 80) + print(f"STARTING BS={len(prompts)} RUN (all prompts batched)") + print("=" * 80 + "\n") + + outs_batched = llm.generate(prompts, sp, use_tqdm=False) + assert len(outs_batched) == len(prompts) + bsN_logprobs_per_prompt = [] + bsN_tokens_per_prompt = [] + + print(f"\n[BS={len(prompts)}] Processing batched outputs...") + for idx, o in enumerate(outs_batched): + tokens = o.outputs[0].token_ids if o.outputs else "N/A" + print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") + step_logprobs, token_ids = _extract_step_logprobs(o) + if step_logprobs is None: + pytest.skip( + "Logits are not available on RequestOutput; " + "enable logprobs return to run this test." + ) + bsN_logprobs_per_prompt.append(step_logprobs) + bsN_tokens_per_prompt.append(token_ids) + + # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. + differences_found = [] + for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): + if len(logprobs_bs1) != len(logprobs_bsN): + reason = ( + f"Different number of steps: {len(logprobs_bs1)} (BS=1) " + f"vs {len(logprobs_bsN)} (BS=N)" + ) + differences_found.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) + continue + + # Check if tokens match first + if tokens_bs1 != tokens_bsN: + differences_found.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) + continue + + for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): + if a.shape != b.shape: + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) + break + + if not torch.equal(a, b): + max_diff = torch.abs(a - b).max().item() + print( + f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " + f"Token {t}: max_diff={max_diff:.6e}" + ) + bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" + bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" + print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") + print(f" BS=1 logprob: {a.tolist()}") + print(f" BS=N logprob: {b.tolist()}") + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) + break + + # Print summary + print(f"\n{'=' * 80}") + if differences_found: + success_msg = ( + f"✓ SUCCESS: Batch invariance is doing something! " + f"Found {len(differences_found)}/{len(prompts)} prompts " + f"with differences when batch invariance was DISABLED." 
+ ) + print(success_msg) + print(f"{'=' * 80}") + for diff in differences_found: + print(f"\nPrompt {diff['prompt_idx']} (step {diff['step']}):") + print(f" Reason: {diff['reason']}") + print(f" Preview: {diff['prompt_preview']}...") + if "bs1_tokens" in diff: + print(f" BS=1 tokens: {diff['bs1_tokens']}") + if "bsN_tokens" in diff: + print(f" BS=N tokens: {diff['bsN_tokens']}") print(f"{'=' * 80}\n") - - llm = LLM( - model=model_name, - tensor_parallel_size=tp_size, - enable_prefix_caching=False, - max_num_seqs=32, - max_model_len=8192, - dtype="bfloat16", + # Test PASSES because we found differences (batch invariance matters!) + return + else: + # Test FAILS because everything matched even without batch invariance + fail_msg = ( + f"✗ UNEXPECTED: All {len(prompts)} prompts matched " + f"between BS=1 and BS=N even with batch invariance DISABLED. " + f"This suggests batch invariance might not be necessary, " + f"or the test needs more sensitive prompts." ) - - # build ragged prompts to change shapes significantly across BS=1 vs BS=N - long_min = int(os.getenv("VLLM_MIN_PROMPT", "768")) - long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048")) - prompts: list[str] = [] - options = [ - (max(long_min, 1536), max(long_max, 3072)), # very long - (max(1024, long_min), max(2048, long_max)), # long - (256, 512), # mid - (10, 20), # short - ] - - for _ in range(32): - lo, hi = random.choice(options) - prompts.append(_random_prompt(lo, hi)) - - sp = SamplingParams( - temperature=0.6, - top_p=1.0, - max_tokens=8, - seed=1234, - logprobs=5, - ) - - # BS=1: run prompts individually and collect logprobs per step. - print("\n" + "=" * 80) - print("STARTING BS=1 RUNS (each prompt individually)") - print("=" * 80 + "\n") - - bs1_logprobs_per_prompt = [] - bs1_tokens_per_prompt = [] - for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." - ) - outs = llm.generate([p], sp, use_tqdm=False) - assert len(outs) == 1 - step_logprobs, token_ids = _extract_step_logprobs(outs[0]) - if step_logprobs is None: - pytest.skip( - "Logits are not available on RequestOutput; " - "enable logprobs return to run this test." - ) - bs1_logprobs_per_prompt.append(step_logprobs) - bs1_tokens_per_prompt.append(token_ids) - print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") - - # BS=N: run prompts in a batch and collect logprobs per step for each prompt. - print("\n" + "=" * 80) - print(f"STARTING BS={len(prompts)} RUN (all prompts batched)") - print("=" * 80 + "\n") - - outs_batched = llm.generate(prompts, sp, use_tqdm=False) - assert len(outs_batched) == len(prompts) - bsN_logprobs_per_prompt = [] - bsN_tokens_per_prompt = [] - - print(f"\n[BS={len(prompts)}] Processing batched outputs...") - for idx, o in enumerate(outs_batched): - tokens = o.outputs[0].token_ids if o.outputs else "N/A" - print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") - step_logprobs, token_ids = _extract_step_logprobs(o) - if step_logprobs is None: - pytest.skip( - "Logits are not available on RequestOutput; " - "enable logprobs return to run this test." - ) - bsN_logprobs_per_prompt.append(step_logprobs) - bsN_tokens_per_prompt.append(token_ids) - - # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. 
- differences_found = [] - for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - ) - ): - if len(logprobs_bs1) != len(logprobs_bsN): - reason = ( - f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)" - ) - differences_found.append( - { - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - } - ) - continue - - # Check if tokens match first - if tokens_bs1 != tokens_bsN: - differences_found.append( - { - "prompt_idx": i, - "step": "sampling", - "reason": "Different tokens sampled", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - } - ) - continue - - for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): - if a.shape != b.shape: - differences_found.append( - { - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - } - ) - break - - if not torch.equal(a, b): - max_diff = torch.abs(a - b).max().item() - print( - f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " - f"Token {t}: max_diff={max_diff:.6e}" - ) - bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" - bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" - print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") - print(f" BS=1 logprob: {a.tolist()}") - print(f" BS=N logprob: {b.tolist()}") - differences_found.append( - { - "prompt_idx": i, - "step": t, - "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - } - ) - break - - # Print summary - print(f"\n{'=' * 80}") - if differences_found: - success_msg = ( - f"✓ SUCCESS: Batch invariance is doing something! " - f"Found {len(differences_found)}/{len(prompts)} prompts " - f"with differences when batch invariance was DISABLED." - ) - print(success_msg) - print(f"{'=' * 80}") - for diff in differences_found: - print(f"\nPrompt {diff['prompt_idx']} (step {diff['step']}):") - print(f" Reason: {diff['reason']}") - print(f" Preview: {diff['prompt_preview']}...") - if "bs1_tokens" in diff: - print(f" BS=1 tokens: {diff['bs1_tokens']}") - if "bsN_tokens" in diff: - print(f" BS=N tokens: {diff['bsN_tokens']}") - print(f"{'=' * 80}\n") - # Test PASSES because we found differences (batch invariance matters!) - return - else: - # Test FAILS because everything matched even without batch invariance - fail_msg = ( - f"✗ UNEXPECTED: All {len(prompts)} prompts matched " - f"between BS=1 and BS=N even with batch invariance DISABLED. " - f"This suggests batch invariance might not be necessary, " - f"or the test needs more sensitive prompts." - ) - print(fail_msg) - print(f"{'=' * 80}\n") - pytest.fail(fail_msg) - - finally: - # Restore original value - if old_value is None: - os.environ.pop("VLLM_BATCH_INVARIANT", None) - else: - os.environ["VLLM_BATCH_INVARIANT"] = old_value + print(fail_msg) + print(f"{'=' * 80}\n") + pytest.fail(fail_msg) @skip_unsupported @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.forked -def test_decode_logprobs_match_prefill_logprobs(backend): +def test_decode_logprobs_match_prefill_logprobs( + backend, monkeypatch: pytest.MonkeyPatch +): """ Test that verifies decode logprobs match prefill logprobs. 
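For reference, the manual VLLM_BATCH_INVARIANT / VLLM_ATTENTION_BACKEND bookkeeping removed in this file relies on a documented pytest guarantee: the monkeypatch fixture undoes every setenv at teardown, even when the test body raises. A minimal standalone sketch of the pattern (hypothetical test name, illustrative only, not part of the patch):

import os

import pytest


def test_backend_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # setenv records the previous value (or absence) of the variable and
    # restores it automatically at teardown, so no try/finally block or
    # manual os.environ save/restore is needed.
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
    assert os.environ["VLLM_ATTENTION_BACKEND"] == "FLASH_ATTN"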
@@ -724,8 +729,7 @@ def test_decode_logprobs_match_prefill_logprobs(backend): This ensures that the logprobs from decode are consistent with what we would get if we ran prefill on each prefix. """ - backend = os.getenv("VLLM_ATTENTION_BACKEND", backend) - os.environ["VLLM_ATTENTION_BACKEND"] = backend + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index e9c255b1ee..b654ea4298 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -1,15 +1,68 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import socket import time +import msgspec +import msgspec.msgpack import pytest +import zmq +from tqdm import tqdm -from vllm import LLM, SamplingParams -from vllm.config import KVTransferConfig +from vllm import LLM, SamplingParams, TokensPrompt +from vllm.config import KVEventsConfig, KVTransferConfig +from vllm.distributed.kv_events import BlockStored, KVEventBatch CPU_BLOCK_SIZES = [16, 48] +class MockSubscriber: + """Helper class to receive and verify published events""" + + def __init__( + self, + endpoint: str, + topic: str, + ): + self.ctx = zmq.Context.instance() + self.topic_bytes = topic.encode("utf-8") + + # Set up subscriber socket + self.sub = self.ctx.socket(zmq.SUB) + self.sub.setsockopt(zmq.SUBSCRIBE, self.topic_bytes) + self.sub.connect(endpoint) + + self.decoder = msgspec.msgpack.Decoder(type=KVEventBatch) + + def get_new_cpu_stored_events(self) -> list[BlockStored]: + cpu_stored_events: list[BlockStored] = [] + + poller = zmq.Poller() + poller.register(self.sub, zmq.POLLIN) + + timeout = 1000 # 1 second + while True: + events = dict(poller.poll(timeout)) + + if events.get(self.sub) != zmq.POLLIN: + return cpu_stored_events + + topic_bytes, _, payload = self.sub.recv_multipart() + + assert topic_bytes == self.topic_bytes + + event_batch = self.decoder.decode(payload) + assert isinstance(event_batch, KVEventBatch) + for event in event_batch.events: + if isinstance(event, BlockStored) and event.medium == "CPU": + cpu_stored_events.append(event) + timeout = 100 + + def close(self): + """Clean up resources""" + self.sub.close() + + @pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) def test_cpu_offloading(cpu_block_size: int) -> None: """ @@ -20,41 +73,80 @@ def test_cpu_offloading(cpu_block_size: int) -> None: kv_transfer_config = KVTransferConfig( kv_connector="OffloadingConnector", kv_role="kv_both", - kv_connector_extra_config={"num_cpu_blocks": 100, "block_size": cpu_block_size}, + kv_connector_extra_config={ + "num_cpu_blocks": 1000, + "block_size": cpu_block_size, + }, + ) + + port: int + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("0.0.0.0", 0)) + port = s.getsockname()[1] + + events_endpoint = f"tcp://*:{port}" + kv_events_config = KVEventsConfig( + enable_kv_cache_events=True, + publisher="zmq", + endpoint=events_endpoint, + topic="test", ) llm = LLM( model="meta-llama/Llama-3.2-1B-Instruct", gpu_memory_utilization=0.5, + kv_events_config=kv_events_config, kv_transfer_config=kv_transfer_config, - disable_hybrid_kv_cache_manager=True, ) - prompts = ["Hi " * 100] - sampling_params = SamplingParams(temperature=0, max_tokens=20) + sampling_params = SamplingParams(temperature=0, max_tokens=1) - # run generation - this should trigger saving KV cache - start_time = 
time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cold_time = time.time() - start_time + events_endpoint = events_endpoint.replace("*", "127.0.0.1") + subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic) - # run generation again - should hit the GPU prefix cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - gpu_hit_time = time.time() - start_time + try: + num_times_cpu_better_than_cold = 0 + num_tests = 10 + total_cold_time = 0.0 + total_gpu_hit_time = 0.0 + total_cpu_hit_time = 0.0 + prompt_token_ids = [0] * 10001 + for i in tqdm(range(num_tests), desc="Running tests"): + prompt_token_ids[0] = i + prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] - # reset prefix cache to avoid GPU hit. - llm.reset_prefix_cache() + # run generation - this should trigger saving KV cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cold_time = time.time() - start_time + total_cold_time += cold_time - # sleep for a sec to make sure CPU finished storing - time.sleep(1) + # run generation again - should hit the GPU prefix cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + gpu_hit_time = time.time() - start_time + total_gpu_hit_time += gpu_hit_time - # run generation again - this should trigger loading from CPU - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cpu_hit_time = time.time() - start_time + # reset prefix cache to avoid GPU hit. + llm.reset_prefix_cache() - print("Generation times:") - print(f" Cold: {cold_time * 1000:.2f}ms") - print(f" GPU hit: {gpu_hit_time * 1000:.2f}ms") - print(f" CPU hit: {cpu_hit_time * 1000:.2f}ms") + assert subscriber.get_new_cpu_stored_events() + + # run generation again - this should trigger loading from CPU + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cpu_hit_time = time.time() - start_time + total_cpu_hit_time += cpu_hit_time + + if cpu_hit_time < cold_time: + num_times_cpu_better_than_cold += 1 + + print("Average times:") + print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") + print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") + print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") + + assert num_times_cpu_better_than_cold >= 0.8 * num_tests + finally: + subscriber.close() + del llm diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/v1/spec_decode/test_speculators_eagle3.py similarity index 94% rename from tests/speculative_decoding/speculators/test_eagle3.py rename to tests/v1/spec_decode/test_speculators_eagle3.py index 19ba32d8de..5ce6e1593b 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/v1/spec_decode/test_speculators_eagle3.py @@ -22,10 +22,6 @@ from vllm.model_executor.models.interfaces import supports_eagle3 "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16", id="qwen3-eagle3-speculator-w4a16-verifier", ), - pytest.param( - "nm-testing/random-weights-llama3.1.8b-2layer-eagle3", - id="llama3-eagl3-multiple-layers", - ), ], ) def test_eagle3_speculators_model( diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 1aa0709696..18aa599f1a 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -212,10 +212,12 @@ def test_update_states_request_resumed(model_runner): # resume req cached_req_data = 
CachedRequestData( req_ids=[req_id], - resumed_from_preemption=[False], + resumed_req_ids={req_id}, new_token_ids=[[]], + all_token_ids={req_id: scheduler_output.scheduled_new_reqs[0].prompt_token_ids}, new_block_ids=[([],)], num_computed_tokens=[0], + num_output_tokens=[0], ) scheduler_output = SchedulerOutput( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index c2c34ee95a..9007436350 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -259,10 +259,10 @@ def test_update_states_request_resumed(model_runner, dist_init): # resume req cached_req_data = CachedRequestData( req_ids=[req_id], - resumed_from_preemption=[False], + resumed_req_ids=set(), new_token_ids=[[]], - resumed_req_token_ids=[None], - new_block_ids=([[0]],), + all_token_ids={}, + new_block_ids=[([0],)], num_computed_tokens=[0], num_output_tokens=[0], ) diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index c808b01d2e..742aab6b0d 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -37,7 +37,7 @@ def is_pip_package_installed(package_name): def find_nixl_wheel_in_cache(cache_dir): """Finds a nixl wheel file in the specified cache directory.""" # The repaired wheel will have a 'manylinux' tag, but this glob still works. - search_pattern = os.path.join(cache_dir, "nixl-*.whl") + search_pattern = os.path.join(cache_dir, "nixl*.whl") wheels = glob.glob(search_pattern) if wheels: # Sort to get the most recent/highest version if multiple exist diff --git a/tools/check_init_lazy_imports.py b/tools/pre_commit/check_init_lazy_imports.py similarity index 96% rename from tools/check_init_lazy_imports.py rename to tools/pre_commit/check_init_lazy_imports.py index 8b3a0b2a71..ab2ef8b3aa 100644 --- a/tools/check_init_lazy_imports.py +++ b/tools/pre_commit/check_init_lazy_imports.py @@ -6,13 +6,12 @@ i.e: appears only within the `if typing.TYPE_CHECKING:` guard, """ import ast -import pathlib import sys from collections.abc import Iterable +from pathlib import Path from typing import Final -REPO_ROOT: Final = pathlib.Path(__file__).resolve().parent.parent -INIT_PATH: Final = REPO_ROOT / "vllm" / "__init__.py" +INIT_PATH: Final = Path("vllm/__init__.py") # If you need to add items to whitelist, do it here. 
ALLOWED_IMPORTS: Final[frozenset[str]] = frozenset( diff --git a/tools/check_spdx_header.py b/tools/pre_commit/check_spdx_header.py similarity index 100% rename from tools/check_spdx_header.py rename to tools/pre_commit/check_spdx_header.py diff --git a/tools/check_triton_import.py b/tools/pre_commit/check_triton_import.py similarity index 100% rename from tools/check_triton_import.py rename to tools/pre_commit/check_triton_import.py diff --git a/tools/enforce_regex_import.py b/tools/pre_commit/enforce_regex_import.py similarity index 100% rename from tools/enforce_regex_import.py rename to tools/pre_commit/enforce_regex_import.py diff --git a/tools/generate_nightly_torch_test.py b/tools/pre_commit/generate_nightly_torch_test.py similarity index 100% rename from tools/generate_nightly_torch_test.py rename to tools/pre_commit/generate_nightly_torch_test.py diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index a3aa546347..8d04848f8f 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -36,12 +36,15 @@ FILES = [ "vllm/transformers_utils", "vllm/triton_utils", "vllm/usage", + "vllm/v1/core", + "vllm/v1/engine", ] # After fixing errors resulting from changing follow_imports # from "skip" to "silent", move the following directories to FILES SEPARATE_GROUPS = [ "tests", + # v0 related "vllm/attention", "vllm/compilation", "vllm/engine", @@ -50,7 +53,16 @@ SEPARATE_GROUPS = [ "vllm/model_executor", "vllm/plugins", "vllm/worker", - "vllm/v1", + # v1 related + "vllm/v1/attention", + "vllm/v1/executor", + "vllm/v1/kv_offload", + "vllm/v1/metrics", + "vllm/v1/pool", + "vllm/v1/sample", + "vllm/v1/spec_decode", + "vllm/v1/structured_output", + "vllm/v1/worker", ] # TODO(woosuk): Include the code from Megatron and HuggingFace. 
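The relocated tools/pre_commit/check_init_lazy_imports.py parses vllm/__init__.py with the ast module and flags imports that execute eagerly instead of sitting under the `if TYPE_CHECKING:` guard or in the ALLOWED_IMPORTS whitelist. A simplified sketch of that kind of check, assuming a plain `if TYPE_CHECKING:` guard and ignoring the whitelist and lazy __getattr__ handling, so it is an approximation rather than the script itself:

import ast
from pathlib import Path

INIT_PATH = Path("vllm/__init__.py")  # relative path; pre-commit runs from the repo root


def eager_imports(source: str) -> list[str]:
    """Names imported at module level outside an `if TYPE_CHECKING:` block."""
    tree = ast.parse(source)
    guarded: set[int] = set()
    for node in ast.walk(tree):
        # Remember every node nested under an `if TYPE_CHECKING:` guard.
        if (
            isinstance(node, ast.If)
            and isinstance(node.test, ast.Name)
            and node.test.id == "TYPE_CHECKING"
        ):
            guarded.update(id(child) for child in ast.walk(node))
    return [
        alias.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.Import, ast.ImportFrom)) and id(node) not in guarded
        for alias in node.names
    ]


if __name__ == "__main__":
    print(eager_imports(INIT_PATH.read_text()))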
diff --git a/tools/png-lint.sh b/tools/pre_commit/png-lint.sh similarity index 100% rename from tools/png-lint.sh rename to tools/pre_commit/png-lint.sh diff --git a/tools/shellcheck.sh b/tools/pre_commit/shellcheck.sh similarity index 100% rename from tools/shellcheck.sh rename to tools/pre_commit/shellcheck.sh diff --git a/tools/update-dockerfile-graph.sh b/tools/pre_commit/update-dockerfile-graph.sh similarity index 100% rename from tools/update-dockerfile-graph.sh rename to tools/pre_commit/update-dockerfile-graph.sh diff --git a/tools/validate_config.py b/tools/pre_commit/validate_config.py similarity index 100% rename from tools/validate_config.py rename to tools/pre_commit/validate_config.py diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index a049dc0425..ed4bf0beb7 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -141,7 +141,7 @@ def attempt_to_make_names_unique(entries_and_traces): """ -def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame: +def group_trace_by_operations(trace_df: "pd.DataFrame") -> "pd.DataFrame": def is_rms_norm(op_name: str): if "rms_norm_kernel" in op_name: return True @@ -370,12 +370,12 @@ def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame: def plot_trace_df( - traces_df: pd.DataFrame, + traces_df: "pd.DataFrame", plot_metric: str, plot_title: str, output: Path | None = None, ): - def get_phase_description(traces_df: pd.DataFrame, phase: str) -> str: + def get_phase_description(traces_df: "pd.DataFrame", phase: str) -> str: phase_df = traces_df.query(f'phase == "{phase}"') descs = phase_df["phase_desc"].to_list() assert all([desc == descs[0] for desc in descs]) @@ -438,7 +438,7 @@ def main( top_k: int, json_nodes_to_fold: list[str], ): - def prepare_data(profile_json: dict, step_keys: list[str]) -> pd.DataFrame: + def prepare_data(profile_json: dict, step_keys: list[str]) -> "pd.DataFrame": def get_entries_and_traces(key: str): entries_and_traces: list[tuple[Any, Any]] = [] for root in profile_json[key]["summary_stats"]: @@ -449,8 +449,8 @@ def main( return entries_and_traces def keep_only_top_entries( - df: pd.DataFrame, metric: str, top_k: int = 9 - ) -> pd.DataFrame: + df: "pd.DataFrame", metric: str, top_k: int = 9 + ) -> "pd.DataFrame": df.loc[df.nsmallest(len(df) - top_k + 1, metric).index, ["name"]] = "others" return df diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index e773e1d13f..60ee0124c3 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -151,7 +151,9 @@ class ipex_ops: def rms_norm( input: torch.Tensor, weight: torch.Tensor, epsilon: float ) -> torch.Tensor: - return ipex.llm.functional.rms_norm(input, weight, epsilon) + out = torch.empty_like(input) + torch.ops.torch_ipex.rms_norm_vllm(out, input.contiguous(), weight, epsilon) + return out @staticmethod def fused_add_rms_norm( @@ -160,10 +162,7 @@ class ipex_ops: weight: torch.Tensor, epsilon: float, ) -> None: - tmp = ipex.llm.functional.add_rms_norm( - residual, input, weight, None, epsilon, True - ) - input.copy_(tmp) + torch.ops.torch_ipex.fused_add_rms_norm_vllm(input, residual, weight, epsilon) @staticmethod def varlen_attention( @@ -296,16 +295,6 @@ class ipex_ops: num_splits=0, s_aux: torch.Tensor | None = None, ): - if cu_seqlens_k is None: - # cu_seqlens_k is not used in ipex kernel. 
- cu_seqlens_k = torch.cumsum(seqused_k, dim=0) - cu_seqlens_k = torch.cat( - [ - torch.tensor([0], device=seqused_k.device, dtype=torch.int32), - cu_seqlens_k, - ] - ).to(torch.int32) - real_window_size: tuple[int, int] if window_size is None: real_window_size = (-1, -1) @@ -318,7 +307,7 @@ class ipex_ops: k, v, cu_seqlens_q, - cu_seqlens_k, + seqused_k, max_seqlen_q, max_seqlen_k, softmax_scale, diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 8818b59970..d025368cbd 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -94,7 +94,7 @@ def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: metadata = { "total_num_frames": num_frames, - "fps": fps, + "fps": duration / num_frames, "duration": duration, "video_backend": "opencv", "frames_indices": list(range(num_frames)), diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py new file mode 100644 index 0000000000..f71f49a1a3 --- /dev/null +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -0,0 +1,125 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file contains ops for ViT attention to be compatible with torch.compile +as there are operations here not supported by torch.compile (for instance, +`to_list` in xformers attn, or `.item()` in flash attention) + +Using these ops and wrapping vision blocks with `torch.compile` can speed up +throughput in vision models by ~5% relative on H100, and improve token +latencies by ~7% (see qwen2_5_vl for example usage) + +To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0) +""" + +import einops +import torch + +from vllm.utils.torch_utils import direct_register_custom_op + + +def xformers_attn_seqlens_wrapper( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor +) -> torch.Tensor: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + attn_bias = BlockDiagonalMask.from_seqlens( + q_seqlen=seqlens.tolist(), kv_seqlen=None, device=q.device + ) + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None + ) + context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous() + return context_layer + + +def xformers_attn_seqlens_wrapper_fake( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor +) -> torch.Tensor: + b, s, h, d = q.shape + return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) + + +direct_register_custom_op( + op_name="xformers_attn_seqlens_wrapper", + op_func=xformers_attn_seqlens_wrapper, + fake_impl=xformers_attn_seqlens_wrapper_fake, +) + + +def vit_xformers_attn_wrapper( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor +) -> torch.Tensor: + return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens) + + +def flash_attn_maxseqlen_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + batch_size: int, + is_rocm_aiter: bool, + use_upstream_fa: bool, +) -> torch.Tensor: + if is_rocm_aiter: + from aiter import flash_attn_varlen_func + else: + if use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func + q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + output = flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen.item(), + max_seqlen_k=max_seqlen.item(), + dropout_p=0.0, + causal=False, + ) + context_layer = einops.rearrange( + output, "(b s) h d -> s b (h d)", b=batch_size + ).contiguous() + return context_layer + + +def flash_attn_maxseqlen_wrapper_fake( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + batch_size: int, + is_rocm_aiter: bool, + use_upstream_fa: bool, +) -> torch.Tensor: + b, s, h, d = q.shape + return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) + + +direct_register_custom_op( + op_name="flash_attn_maxseqlen_wrapper", + op_func=flash_attn_maxseqlen_wrapper, + fake_impl=flash_attn_maxseqlen_wrapper_fake, +) + + +def vit_flash_attn_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + batch_size: int, + is_rocm_aiter: bool, + use_upstream_fa: bool, +) -> torch.Tensor: + return torch.ops.vllm.flash_attn_maxseqlen_wrapper( + q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa + ) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 55e24bd5d9..b1aa8530eb 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -27,8 +27,10 @@ from copy import deepcopy from dataclasses import dataclass from functools import cache from io import BytesIO +from tempfile import NamedTemporaryFile from typing import Any, cast +import cv2 import numpy as np from PIL import Image from transformers import PreTrainedTokenizerBase @@ -498,9 +500,13 @@ class RandomDataset(BenchmarkDataset): num_requests, range_ratio, input_len, output_len, tokenizer ) - # Generate prefix once - prefix_token_ids = self.get_prefix(tokenizer, prefix_len) vocab_size = tokenizer.vocab_size + prohibited_tokens = tokenizer.all_special_ids + all_tokens = np.arange(vocab_size) + allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens))) + + # Generate prefix once + prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len) requests = [] token_mismatch_total = 0 @@ -513,6 +519,7 @@ class RandomDataset(BenchmarkDataset): input_len=int(input_lens[i]), offset=int(offsets[i]), index=i, + allowed_tokens=allowed_tokens, ) token_mismatch_total += token_mismatch requests.append( @@ -553,13 +560,17 @@ class RandomDataset(BenchmarkDataset): return requests def get_prefix( - self, tokenizer: PreTrainedTokenizerBase, prefix_len: int + self, + allowed_tokens: np.ndarray, + prefix_len: int, ) -> list[int]: """ Get the prefix for the dataset. """ return ( - self._rng.integers(0, tokenizer.vocab_size, size=prefix_len).tolist() + allowed_tokens[ + self._rng.integers(0, len(allowed_tokens), size=prefix_len) + ].tolist() if prefix_len > 0 else [] ) @@ -623,6 +634,7 @@ class RandomDataset(BenchmarkDataset): input_len: int, offset: int, index: int, + allowed_tokens: np.ndarray, ) -> tuple[str, int, int]: """ Returns (prompt, total_input_len). @@ -636,8 +648,11 @@ class RandomDataset(BenchmarkDataset): To avoid uncontrolled change of the prompt length, the encoded sequence is truncated before being decoded again. 
""" - # Build the inner sequence by sampling sequentially from the vocab - inner_seq = ((offset + index + np.arange(input_len)) % vocab_size).tolist() + # Build the inner sequence by sampling + # sequentially from the allowed tokens + inner_seq = allowed_tokens[ + (offset + index + np.arange(input_len)) % len(allowed_tokens) + ].tolist() token_sequence = prefix_token_ids + inner_seq # Decode, then re-encode and truncate to preserve token count invariants @@ -772,7 +787,7 @@ class RandomMultiModalDataset(RandomDataset): Status: - Images: supported via synthetic RGB data. - - Video: not yet supported (TODO: implement video generation method). + - Video: supported via synthetic RGB data. - Audio: not yet supported. Sampling overview: @@ -782,7 +797,7 @@ class RandomMultiModalDataset(RandomDataset): The maximum is further clamped to the sum of per-modality limits. 2) Each item’s modality and shape is sampled from `bucket_config`, a dict mapping (height, width, num_frames) → probability. We treat - `num_frames`=1 as image and and `num_frames` > 1 as video. + `num_frames`=1 as image and `num_frames` > 1 as video. Entries with zero probability are removed and the rest are renormalized to sum to 1. 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`. @@ -797,8 +812,7 @@ class RandomMultiModalDataset(RandomDataset): """ IS_MULTIMODAL = True - # NOTE: video sampling is WIP. Setting it to 0. - DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 0} + DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 1} DEFAULT_BASE_ITEMS_PER_REQUEST = 1 DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0 @@ -828,12 +842,47 @@ class RandomMultiModalDataset(RandomDataset): ) return Image.fromarray(random_pixels) - def generate_synthetic_video(self, width: int, height: int, num_frames: int) -> Any: + def generate_synthetic_video( + self, width: int, height: int, num_frames: int + ) -> dict: """Generate synthetic video with random values. - TODO: Finish this method. + Creates a video with random pixel values, encodes it to MP4 format, + and returns the content as bytes. """ - raise NotImplementedError("Video sampling is WIP.") + random_pixels = self._rng.integers( + 0, + 256, + (num_frames, height, width, 3), + dtype=np.uint8, + ) + + # Create a temporary video file in memory + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + fps = 30 # frames per second + + with NamedTemporaryFile(suffix=".mp4", delete_on_close=False) as temp_file: + temp_path = temp_file.name + + # Create video writer + video_writer = cv2.VideoWriter( + temp_path, fourcc=fourcc, fps=fps, frameSize=(width, height) + ) + + if not video_writer.isOpened(): + raise RuntimeError("Failed to create video writer") + + for frame in random_pixels: + video_writer.write(frame) + + video_writer.release() + temp_file.close() + + # Read the video file content + with open(temp_path, "rb") as f: + video_content = f.read() + + return {"bytes": video_content} def map_config_to_modality(self, config: tuple[int, int, int]) -> str: """Map the configuration to the modality.""" @@ -1044,16 +1093,6 @@ class RandomMultiModalDataset(RandomDataset): enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT, **kwargs, ) -> list[SampleRequest]: - # NOTE: Video sampling is WIP. Raise error if video is in bucket config - # and probability is non-zero. - if any( - self.map_config_to_modality(cfg) == "video" and p > 0 - for cfg, p in bucket_config.items() - ): - raise NotImplementedError( - "Video sampling not implemented; set its probability to 0." 
-            )
-
         # Get the sampling parameters for the dataset
         input_lens, output_lens, offsets = self.get_sampling_params(
             num_requests, range_ratio, input_len, output_len, tokenizer
         )
@@ -1071,9 +1110,24 @@
             bucket_config,
         )

-        # Generate prefix once
-        prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
         vocab_size = tokenizer.vocab_size
+        # Can't use tokenizer.all_special_ids since
+        # it returns ONLY ids from special_tokens_map.json
+        # We want to exclude placeholder tokens and all
+        # tokens that indicate start/end of image as it
+        # may break prompt replacement logic.
+        prohibited_tokens = list(
+            tok_id
+            for tok_id, token in tokenizer.added_tokens_decoder.items()
+            if token.special
+        )
+        all_tokens = np.arange(vocab_size)
+        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
+        logger.debug(
+            "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size
+        )
+        # Generate prefix once
+        prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)
         # Add synthetic multimodal items to each request
         mm_requests = []
         token_mismatch_total = 0
@@ -1086,6 +1140,7 @@
                 input_len=int(input_lens[i]),
                 offset=int(offsets[i]),
                 index=i,
+                allowed_tokens=allowed_tokens,
             )
             token_mismatch_total += token_mismatch
             # Get multimodal item iterator for a given request
diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py
new file mode 100644
index 0000000000..108cd75690
--- /dev/null
+++ b/vllm/benchmarks/sweep/cli.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+
+from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
+
+from .plot import SweepPlotArgs
+from .plot import main as plot_main
+from .serve import SweepServeArgs
+from .serve import main as serve_main
+from .serve_sla import SweepServeSLAArgs
+from .serve_sla import main as serve_sla_main
+
+SUBCOMMANDS = (
+    (SweepServeArgs, serve_main),
+    (SweepServeSLAArgs, serve_sla_main),
+    (SweepPlotArgs, plot_main),
+)
+
+
+def add_cli_args(parser: argparse.ArgumentParser):
+    subparsers = parser.add_subparsers(required=True, dest="sweep_type")
+
+    for cmd, entrypoint in SUBCOMMANDS:
+        cmd_subparser = subparsers.add_parser(
+            cmd.parser_name,
+            description=cmd.parser_help,
+            usage=f"vllm bench sweep {cmd.parser_name} [options]",
+        )
+        cmd_subparser.set_defaults(dispatch_function=entrypoint)
+        cmd.add_cli_args(cmd_subparser)
+        cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
+            subcmd=f"sweep {cmd.parser_name}"
+        )
+
+
+def main(args: argparse.Namespace):
+    args.dispatch_function(args)
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 92485c09b4..9947d6170d 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -8,16 +8,24 @@
 from dataclasses import dataclass
 from functools import partial
 from pathlib import Path
 from types import TracebackType
+from typing import ClassVar

-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
 from typing_extensions import Self, override

 from vllm.utils.collection_utils import full_groupby
+from vllm.utils.import_utils import PlaceholderModule

 from .utils import sanitize_filename

+try:
+    import matplotlib.pyplot as plt
+    import pandas as pd
+    import seaborn as sns
+except ImportError:
+    plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
+    pd = PlaceholderModule("pandas")
+    sns = 
PlaceholderModule("seaborn") + @dataclass class PlotFilterBase(ABC): @@ -40,7 +48,7 @@ class PlotFilterBase(ABC): ) @abstractmethod - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": """Applies this filter to a DataFrame.""" raise NotImplementedError @@ -48,7 +56,7 @@ class PlotFilterBase(ABC): @dataclass class PlotEqualTo(PlotFilterBase): @override - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": try: target = float(self.target) except ValueError: @@ -60,28 +68,28 @@ class PlotEqualTo(PlotFilterBase): @dataclass class PlotLessThan(PlotFilterBase): @override - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": return df[df[self.var] < float(self.target)] @dataclass class PlotLessThanOrEqualTo(PlotFilterBase): @override - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": return df[df[self.var] <= float(self.target)] @dataclass class PlotGreaterThan(PlotFilterBase): @override - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": return df[df[self.var] > float(self.target)] @dataclass class PlotGreaterThanOrEqualTo(PlotFilterBase): @override - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": return df[df[self.var] >= float(self.target)] @@ -103,7 +111,7 @@ class PlotFilters(list[PlotFilterBase]): return cls(PlotFilterBase.parse_str(e) for e in s.split(",")) - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": for item in self: df = item.apply(df) @@ -127,7 +135,7 @@ class PlotBinner: f"Valid operators are: {sorted(PLOT_BINNERS)}", ) - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": """Applies this binner to a DataFrame.""" df = df.copy() df[self.var] = df[self.var] // self.bin_size * self.bin_size @@ -147,7 +155,7 @@ class PlotBinners(list[PlotBinner]): return cls(PlotBinner.parse_str(e) for e in s.split(",")) - def apply(self, df: pd.DataFrame) -> pd.DataFrame: + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": for item in self: df = item.apply(df) @@ -396,135 +404,177 @@ def plot( ) -def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument( - "OUTPUT_DIR", - type=str, - default="results", - help="The directory containing the results to plot, " - "i.e., the `--output-dir` argument to the parameter sweep script.", - ) - parser.add_argument( - "--fig-dir", - type=str, - default="", - help="The directory to save the figures, relative to `OUTPUT_DIR`. 
" - "By default, the same directory is used.", - ) - parser.add_argument( - "--fig-by", - type=str, - default="", - help="A comma-separated list of variables, such that a separate figure " - "is created for each combination of these variables.", - ) - parser.add_argument( - "--row-by", - type=str, - default="", - help="A comma-separated list of variables, such that a separate row " - "is created for each combination of these variables.", - ) - parser.add_argument( - "--col-by", - type=str, - default="", - help="A comma-separated list of variables, such that a separate column " - "is created for each combination of these variables.", - ) - parser.add_argument( - "--curve-by", - type=str, - default=None, - help="A comma-separated list of variables, such that a separate curve " - "is created for each combination of these variables.", - ) - parser.add_argument( - "--var-x", - type=str, - default="request_throughput", - help="The variable for the x-axis.", - ) - parser.add_argument( - "--var-y", - type=str, - default="p99_e2el_ms", - help="The variable for the y-axis", - ) - parser.add_argument( - "--filter-by", - type=str, - default="", - help="A comma-separated list of statements indicating values to filter by. " - "This is useful to remove outliers. " - "Example: `max_concurrency<1000,max_num_batched_tokens<=4096` means " - "plot only the points where `max_concurrency` is less than 1000 and " - "`max_num_batched_tokens` is no greater than 4096.", - ) - parser.add_argument( - "--bin-by", - type=str, - default="", - help="A comma-separated list of statements indicating values to bin by. " - "This is useful to avoid plotting points that are too close together. " - "Example: `request_throughput%1` means " - "use a bin size of 1 for the `request_throughput` variable.", - ) - parser.add_argument( - "--scale-x", - type=str, - default=None, - help="The scale to use for the x-axis. " - "Currently only accepts string values such as 'log' and 'sqrt'. " - "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", - ) - parser.add_argument( - "--scale-y", - type=str, - default=None, - help="The scale to use for the y-axis. " - "Currently only accepts string values such as 'log' and 'sqrt'. " - "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="If set, prints the information about each figure to plot, " - "then exits without drawing them.", - ) +@dataclass +class SweepPlotArgs: + output_dir: Path + fig_dir: Path + fig_by: list[str] + row_by: list[str] + col_by: list[str] + curve_by: list[str] + var_x: str + var_y: str + filter_by: PlotFilters + bin_by: PlotBinners + scale_x: str | None + scale_y: str | None + dry_run: bool + + parser_name: ClassVar[str] = "plot" + parser_help: ClassVar[str] = "Plot performance curves from parameter sweep results." 
+ + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + output_dir = Path(args.OUTPUT_DIR) + if not output_dir.exists(): + raise ValueError(f"No parameter sweep results under {output_dir}") + + curve_by = [] if not args.curve_by else args.curve_by.split(",") + row_by = [] if not args.row_by else args.row_by.split(",") + col_by = [] if not args.col_by else args.col_by.split(",") + fig_by = [] if not args.fig_by else args.fig_by.split(",") + + return cls( + output_dir=output_dir, + fig_dir=output_dir / args.fig_dir, + fig_by=fig_by, + row_by=row_by, + col_by=col_by, + curve_by=curve_by, + var_x=args.var_x, + var_y=args.var_y, + filter_by=PlotFilters.parse_str(args.filter_by), + bin_by=PlotBinners.parse_str(args.bin_by), + scale_x=args.scale_x, + scale_y=args.scale_y, + dry_run=args.dry_run, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser.add_argument( + "OUTPUT_DIR", + type=str, + default="results", + help="The directory containing the results to plot, " + "i.e., the `--output-dir` argument to the parameter sweep script.", + ) + parser.add_argument( + "--fig-dir", + type=str, + default="", + help="The directory to save the figures, relative to `OUTPUT_DIR`. " + "By default, the same directory is used.", + ) + parser.add_argument( + "--fig-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate figure " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--row-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate row " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--col-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate column " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--curve-by", + type=str, + default=None, + help="A comma-separated list of variables, such that a separate curve " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--var-x", + type=str, + default="request_throughput", + help="The variable for the x-axis.", + ) + parser.add_argument( + "--var-y", + type=str, + default="p99_e2el_ms", + help="The variable for the y-axis", + ) + parser.add_argument( + "--filter-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to filter by. " + "This is useful to remove outliers. " + "Example: `max_concurrency<1000,max_num_batched_tokens<=4096` means " + "plot only the points where `max_concurrency` is less than 1000 and " + "`max_num_batched_tokens` is no greater than 4096.", + ) + parser.add_argument( + "--bin-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to bin by. " + "This is useful to avoid plotting points that are too close together. " + "Example: `request_throughput%%1` means " + "use a bin size of 1 for the `request_throughput` variable.", + ) + parser.add_argument( + "--scale-x", + type=str, + default=None, + help="The scale to use for the x-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. " + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", + ) + parser.add_argument( + "--scale-y", + type=str, + default=None, + help="The scale to use for the y-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. 
" + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the information about each figure to plot, " + "then exits without drawing them.", + ) + + return parser -def main(args: argparse.Namespace): - output_dir = Path(args.OUTPUT_DIR) - if not output_dir.exists(): - raise ValueError(f"No parameter sweep results under {output_dir}") - - curve_by = [] if not args.curve_by else args.curve_by.split(",") - row_by = [] if not args.row_by else args.row_by.split(",") - col_by = [] if not args.col_by else args.col_by.split(",") - fig_by = [] if not args.fig_by else args.fig_by.split(",") - - plot( - output_dir=output_dir, - fig_dir=output_dir / args.fig_dir, - fig_by=fig_by, - row_by=row_by, - col_by=col_by, - curve_by=curve_by, +def run_main(args: SweepPlotArgs): + return plot( + output_dir=args.output_dir, + fig_dir=args.fig_dir, + fig_by=args.fig_by, + row_by=args.row_by, + col_by=args.col_by, + curve_by=args.curve_by, var_x=args.var_x, var_y=args.var_y, - filter_by=PlotFilters.parse_str(args.filter_by), - bin_by=PlotBinners.parse_str(args.bin_by), + filter_by=args.filter_by, + bin_by=args.bin_by, scale_x=args.scale_x, scale_y=args.scale_y, dry_run=args.dry_run, ) +def main(args: argparse.Namespace): + run_main(SweepPlotArgs.from_cli_args(args)) + + if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Plot performance curves from parameter sweep results." - ) - add_cli_args(parser) + parser = argparse.ArgumentParser(description=SweepPlotArgs.parser_help) + SweepPlotArgs.add_cli_args(parser) main(parser.parse_args()) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index a06d4d6d60..45ac446a7a 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -7,13 +7,19 @@ import shlex from dataclasses import dataclass from datetime import datetime from pathlib import Path +from typing import ClassVar -import pandas as pd +from vllm.utils.import_utils import PlaceholderModule from .param_sweep import ParameterSweep, ParameterSweepItem from .server import ServerProcess from .utils import sanitize_filename +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + @contextlib.contextmanager def run_server( @@ -257,6 +263,9 @@ class SweepServeArgs: dry_run: bool resume: str | None + parser_name: ClassVar[str] = "serve" + parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings." + @classmethod def from_cli_args(cls, args: argparse.Namespace): serve_cmd = shlex.split(args.serve_cmd) @@ -401,9 +410,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Run vLLM server benchmark under multiple settings." 
- ) + parser = argparse.ArgumentParser(description=SweepServeArgs.parser_help) SweepServeArgs.add_cli_args(parser) main(parser.parse_args()) diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py index 6159aba4bb..0403d1ddfd 100644 --- a/vllm/benchmarks/sweep/serve_sla.py +++ b/vllm/benchmarks/sweep/serve_sla.py @@ -7,17 +7,23 @@ import math from dataclasses import asdict, dataclass from datetime import datetime from pathlib import Path -from typing import Literal, get_args +from typing import ClassVar, Literal, get_args -import pandas as pd from typing_extensions import assert_never +from vllm.utils.import_utils import PlaceholderModule + from .param_sweep import ParameterSweep, ParameterSweepItem from .serve import SweepServeArgs, run_benchmark, run_server from .server import ServerProcess from .sla_sweep import SLASweep, SLASweepItem from .utils import sanitize_filename +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + def _get_sla_base_path( output_dir: Path, @@ -399,6 +405,9 @@ class SweepServeSLAArgs(SweepServeArgs): sla_params: SLASweep sla_variable: SLAVariable + parser_name: ClassVar[str] = "serve_sla" + parser_help: ClassVar[str] = "Tune a variable to meet SLAs under multiple settings." + @classmethod def from_cli_args(cls, args: argparse.Namespace): # NOTE: Don't use super() as `from_cli_args` calls `cls()` @@ -419,7 +428,8 @@ class SweepServeSLAArgs(SweepServeArgs): def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser = super().add_cli_args(parser) - parser.add_argument( + sla_group = parser.add_argument_group("sla options") + sla_group.add_argument( "--sla-params", type=str, required=True, @@ -431,7 +441,7 @@ class SweepServeSLAArgs(SweepServeArgs): "the maximum `sla_variable` that satisfies the constraints for " "each combination of `serve_params`, `bench_params`, and `sla_params`.", ) - parser.add_argument( + sla_group.add_argument( "--sla-variable", type=str, choices=get_args(SLAVariable), @@ -476,9 +486,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Tune a variable to meet SLAs under multiple settings." - ) + parser = argparse.ArgumentParser(description=SweepServeSLAArgs.parser_help) SweepServeSLAArgs.add_cli_args(parser) main(parser.parse_args()) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 69fb93601f..0946fa6917 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -18,7 +18,12 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config +from vllm.config import ( + CompilationMode, + VllmConfig, + get_current_vllm_config, + set_current_vllm_config, +) from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils.import_utils import resolve_obj_by_qualname @@ -74,6 +79,21 @@ def support_torch_compile( ) -> Callable[[_T], _T]: ... +@overload +def support_torch_compile( + *, + mark_unbacked_dims: dict[str, int | list[int]] | None, +) -> Callable[[_T], _T]: ... + + +@overload +def support_torch_compile( + *, + dynamic_arg_dims: dict[str, int | list[int]] | None, + mark_unbacked_dims: dict[str, int | list[int]] | None, +) -> Callable[[_T], _T]: ... 
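The new overloads above add a `mark_unbacked_dims` mapping alongside `dynamic_arg_dims`. A hedged usage sketch follows; the toy module is invented for illustration, and only the decorator arguments come from this patch.

```python
# Illustration only: a made-up module decorated with the new keyword. Dim 0 of
# pixel_values is compiled as dynamic and additionally marked unbacked so dynamo
# never specializes it to size 0/1 (e.g. for dummy multimodal inputs).
import torch
from torch import nn

from vllm.compilation.decorators import support_torch_compile


@support_torch_compile(
    dynamic_arg_dims={"pixel_values": 0},
    mark_unbacked_dims={"pixel_values": 0},
)
class ToyVisionEncoder(nn.Module):
    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        return pixel_values.flatten(1)
```

Because the patched `__init__` falls back to `get_current_vllm_config()`, such a module can also be instantiated without an explicit `vllm_config=` as long as a config is active via `set_current_vllm_config(...)`.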
+ + @overload def support_torch_compile(cls: _T) -> _T: ... @@ -82,6 +102,7 @@ def support_torch_compile( cls: _T | None = None, *, dynamic_arg_dims: dict[str, int | list[int]] | None = None, + mark_unbacked_dims: dict[str, int | list[int]] | None = None, enable_if: Callable[[VllmConfig], bool] | None = None, ) -> Callable[[_T], _T] | _T: """ @@ -135,6 +156,11 @@ def support_torch_compile( returns a boolean value indicating whether to compile the model or not. This is useful if you want to compile the model only when certain conditions are met. + + `mark_unbacked_dims` is a dictionary that maps argument names with a dynamic + dim to be decorated with `mark_unbacked`. This is useful if we would like to + enforce that dynamo do not specialize on 0/1 values in the case of dummy input + such as for vision model compilation """ def cls_decorator_helper(cls: _T) -> _T: @@ -172,7 +198,9 @@ def support_torch_compile( raise ValueError( f"Argument {k} not found in the forward method of {cls}" ) - return _support_torch_compile(cls, inferred_dynamic_arg_dims, enable_if) + return _support_torch_compile( + cls, inferred_dynamic_arg_dims, mark_unbacked_dims, enable_if + ) if cls is not None: # use `support_torch_compile` as a decorator without arguments @@ -212,6 +240,7 @@ def _verify_source_unchanged(source_info, vllm_config) -> None: def _support_torch_compile( cls: _T, dynamic_arg_dims: dict[str, int | list[int]], + mark_unbacked_dims: dict[str, int | list[int]] | None = None, enable_if: Callable[[VllmConfig], bool] | None = None, ) -> _T: """ @@ -230,8 +259,22 @@ def _support_torch_compile( setattr(cls, IGNORE_COMPILE_KEY, False) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs): - old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) + def __init__( + self, *, vllm_config: VllmConfig | None = None, prefix: str = "", **kwargs + ): + if vllm_config is None: + vllm_config = get_current_vllm_config() + + # NOTE: to support multimodal models (such as encoder), + # we may not have vllm_config so we may need to patch + # it + sig = inspect.signature(old_init) + if "vllm_config" in sig.parameters: + kwargs["vllm_config"] = vllm_config + if "prefix" in sig.parameters: + kwargs["prefix"] = prefix + old_init(self, **kwargs) + self.vllm_config = vllm_config enable_compile = enable_if is None or enable_if(vllm_config) # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner @@ -344,6 +387,15 @@ def _support_torch_compile( "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}." 
) + if mark_unbacked_dims: + for k, dims in mark_unbacked_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + dims = [dims] if isinstance(dims, int) else dims + if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.decorators.mark_unbacked(arg, dims) # here, it is the starting point of the `torch.compile` process start_monitoring_torch_compile(self.vllm_config) logger.debug("Start compiling function %s", self.original_code_object) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 1734f6b15d..d743d5aa9d 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -5,7 +5,7 @@ import hashlib from dataclasses import field from typing import TYPE_CHECKING, Any, Literal -from pydantic import Field, SkipValidation, field_validator, model_validator +from pydantic import Field, SkipValidation, field_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -185,11 +185,3 @@ class CacheConfig: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: logger.warning("Possibly too large swap space. %s", msg) - - @model_validator(mode="after") - def validate_mamba_block_size(self) -> "CacheConfig": - if self.mamba_block_size is not None and not self.enable_prefix_caching: - raise ValueError( - "--mamba-block-size can only be set with --enable-prefix-caching" - ) - return self diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index c24a94091b..6a5bd5ef4e 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -453,6 +453,7 @@ class CompilationConfig: "vllm::linear_attention", "vllm::plamo2_mamba_mixer", "vllm::gdn_attention", + "vllm::kda_attention", "vllm::sparse_attn_indexer", ] @@ -684,6 +685,8 @@ class CompilationConfig: from vllm.compilation.backends import VllmBackend + # TODO[@lucaskabela]: See if we can forward prefix + # https://github.com/vllm-project/vllm/issues/27045 return VllmBackend(vllm_config) def post_init_cudagraph_sizes(self) -> None: diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 2f9d638542..84e92eef40 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -9,7 +9,6 @@ from pydantic import ConfigDict, Field, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self -import vllm.envs as envs from vllm.config.utils import config from vllm.logger import init_logger from vllm.platforms import current_platform @@ -106,10 +105,6 @@ class LoRAConfig: return self - def verify_with_cache_config(self, cache_config: CacheConfig): - if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1: - raise ValueError("V0 LoRA does not support CPU offload, please use V1.") - def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): self.lora_dtype = model_config.dtype diff --git a/vllm/config/model.py b/vllm/config/model.py index e22c218c76..092c67e7be 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,7 +32,6 @@ from vllm.transformers_utils.config import ( get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - is_interleaved, try_get_dense_modules, try_get_generation_config, try_get_safetensors_metadata, @@ -442,15 +441,12 @@ class ModelConfig: self.enforce_eager = True # Set the default seed to 0 in V1. 
- # NOTE(woosuk): In V0, we set the default seed to None because the - # driver worker shares the same process as the user process, and thus - # setting a seed affects the user process as well. - # In V1, we use separate processes for workers (unless + # NOTE(woosuk): In V1, we use separate processes for workers (unless # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here # doesn't affect the user process. However, without a consistent seed, # different tensor parallel workers would sample different tokens, # leading to inconsistent results. - if envs.VLLM_USE_V1 and self.seed is None: + if self.seed is None: self.seed = 0 if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( @@ -703,23 +699,6 @@ class ModelConfig: revision=self.revision, ) - # Interleaved attention is not supported by some backends in V0 - if ( - not self.disable_sliding_window - and is_interleaved(self.hf_text_config) - and not envs.VLLM_USE_V1 - and (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER") - ): - logger.warning_once( - "%s has interleaved attention, which is currently not " - "supported by the %s backend. Disabling sliding window and " - "capping the max length to the sliding window size (%d).", - self.hf_text_config.model_type, - backend, - self.hf_text_config.sliding_window, - ) - self.disable_sliding_window = True - self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) # Init multimodal config if needed @@ -1257,6 +1236,7 @@ class ModelConfig: "deepseek_v32", "deepseek_mtp", "kimi_k2", + "kimi_linear", "longcat_flash", ): return self.hf_text_config.kv_lora_rank is not None diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index e8847354bb..82d575f246 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -521,15 +521,11 @@ class ParallelConfig: current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size ): - if not ray_found: - raise ValueError( - "Unable to load Ray: " - f"{ray_utils.ray_import_err}. Ray is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`." - ) - backend = "ray" + gpu_count = cuda_device_count_stateless() + raise ValueError( + f"Tensor parallel size ({self.world_size}) cannot be " + f"larger than the number of available GPUs ({gpu_count})." + ) elif self.data_parallel_backend == "ray": logger.info( "Using ray distributed inference because " diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index 0590f74aa4..6bece8d078 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -7,6 +7,9 @@ from typing import Any from pydantic.dataclasses import dataclass from vllm.config.utils import config +from vllm.logger import init_logger + +logger = init_logger(__name__) @config @@ -48,7 +51,15 @@ class PoolerConfig: """ ## for classification models - activation: bool | None = None + softmax: float | None = None + """ + softmax will be deprecated, please use use_activation instead. + """ + activation: float | None = None + """ + activation will be deprecated, please use use_activation instead. + """ + use_activation: bool | None = None """ Whether to apply activation function to the classification outputs. Defaults to True. @@ -59,11 +70,6 @@ class PoolerConfig: """ ## for reward models - softmax: bool | None = None - """ - Whether to apply softmax to the reward outputs. - Defaults to True. 
- """ step_tag_id: int | None = None """ If set, only the score corresponding to the `step_tag_id` in the @@ -77,6 +83,10 @@ class PoolerConfig: `math-shepherd-mistral-7b-prm` model. """ + def __post_init__(self): + # raise deprecated warning for softmax and activation + self.use_activation = get_use_activation(self) + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -94,3 +104,19 @@ class PoolerConfig: factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str + + +def get_use_activation(o: object): + if softmax := getattr(o, "softmax", None) is not None: + logger.warning_once( + "softmax will be deprecated, please use use_activation instead." + ) + return softmax + + if activation := getattr(o, "activation", None) is not None: + logger.warning_once( + "activation will be deprecated, please use use_activation instead." + ) + return activation + + return getattr(o, "use_activation", None) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 4c7b7369ed..903b9a26fa 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -9,7 +9,6 @@ from pydantic import Field, SkipValidation, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self -import vllm.envs as envs from vllm.config.parallel import ParallelConfig from vllm.config.utils import config from vllm.logger import init_logger @@ -366,12 +365,6 @@ class SpeculativeConfig: # Replace hf_config for EAGLE draft_model if self.method in ("eagle", "eagle3"): - if self.enable_chunked_prefill and not envs.VLLM_USE_V1: - raise ValueError( - "Chunked prefill and EAGLE are not compatible " - "when using V0." - ) - from vllm.transformers_utils.configs import SpeculatorsConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 5e7e7580c5..7e0878d96b 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -33,7 +33,7 @@ def config(cls: ConfigT) -> ConfigT: `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)` which treats the `cli_arg` as a JSON string which gets validated by `pydantic`. - Config validation is performed by the tools/validate_config.py + Config validation is performed by the tools/pre_commit/validate_config.py script, which is invoked during the pre-commit checks. 
""" return cls diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index a7f7f3b45a..1acac70c32 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar import torch -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, model_validator from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -84,7 +84,9 @@ class VllmConfig: default_factory=StructuredOutputsConfig ) """Structured outputs configuration.""" - observability_config: ObservabilityConfig | None = None + observability_config: ObservabilityConfig = Field( + default_factory=ObservabilityConfig + ) """Observability configuration.""" quant_config: QuantizationConfig | None = None """Quantization configuration.""" @@ -130,7 +132,6 @@ class VllmConfig: from vllm import __version__ vllm_factors.append(__version__) - vllm_factors.append(envs.VLLM_USE_V1) if self.model_config: vllm_factors.append(self.model_config.compute_hash()) else: @@ -171,10 +172,7 @@ class VllmConfig: vllm_factors.append(self.structured_outputs_config.compute_hash()) else: vllm_factors.append("None") - if self.observability_config: - vllm_factors.append(self.observability_config.compute_hash()) - else: - vllm_factors.append("None") + vllm_factors.append(self.observability_config.compute_hash()) if self.quant_config: pass # should be captured by model_config.quantization if self.compilation_config: @@ -306,7 +304,6 @@ class VllmConfig: self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config is not None: - self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) if self.quant_config is None and self.model_config is not None: @@ -332,18 +329,9 @@ class VllmConfig: # we use the default mode. The default mode depends on other # settings (see the below code). if self.compilation_config.mode is None: - if envs.VLLM_USE_V1: - if ( - self.model_config is not None - and not self.model_config.enforce_eager - ): - self.compilation_config.mode = CompilationMode.VLLM_COMPILE - else: - self.compilation_config.mode = CompilationMode.NONE - + if self.model_config is not None and not self.model_config.enforce_eager: + self.compilation_config.mode = CompilationMode.VLLM_COMPILE else: - # NB: Passing both --enforce-eager and a compilation mode - # in V0 means the compilation mode wins out. 
self.compilation_config.mode = CompilationMode.NONE else: assert self.compilation_config.mode >= CompilationMode.NONE @@ -371,10 +359,7 @@ class VllmConfig: # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: - if ( - envs.VLLM_USE_V1 - and self.compilation_config.mode == CompilationMode.VLLM_COMPILE - ): + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: # default to full and piecewise for most models self.compilation_config.cudagraph_mode = ( CUDAGraphMode.FULL_AND_PIECEWISE @@ -428,7 +413,7 @@ class VllmConfig: # override related settings when enforce eager self.compilation_config.max_cudagraph_capture_size = 0 self.compilation_config.cudagraph_capture_sizes = [] - elif envs.VLLM_USE_V1: + else: self.compilation_config.cudagraph_num_of_warmups = 1 self._set_cudagraph_sizes() @@ -535,14 +520,11 @@ class VllmConfig: current_platform.check_and_update_config(self) # Do this after all the updates to compilation_config.mode - if ( - envs.VLLM_USE_V1 - and self.compilation_config.mode == CompilationMode.VLLM_COMPILE - ): + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: self.compilation_config.set_splitting_ops_for_v1() # final check of cudagraph mode after all possible updates - if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike(): if ( self.compilation_config.cudagraph_mode.has_full_cudagraphs() and self.model_config is not None @@ -587,10 +569,7 @@ class VllmConfig: if not self.instance_id: self.instance_id = random_uuid()[:5] - if ( - envs.VLLM_USE_V1 - and not self.scheduler_config.disable_hybrid_kv_cache_manager - ): + if not self.scheduler_config.disable_hybrid_kv_cache_manager: # logger should only print warning message for hybrid models. As we # can't know whether the model is hybrid or not now, so we don't log # warning message here and will log it later. @@ -943,6 +922,20 @@ class VllmConfig: f"compilation_config={self.compilation_config!r}" ) + @model_validator(mode="after") + def validate_mamba_block_size(self) -> "VllmConfig": + if self.model_config is None: + return self + mamba_block_size_is_set = ( + self.cache_config.mamba_block_size is not None + and self.cache_config.mamba_block_size != self.model_config.max_model_len + ) + if mamba_block_size_is_set and not self.cache_config.enable_prefix_caching: + raise ValueError( + "--mamba-block-size can only be set with --enable-prefix-caching" + ) + return self + _current_vllm_config: VllmConfig | None = None _current_prefix: str | None = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 2562eb9ce7..2ed0fe592e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -50,7 +50,12 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent - from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorPromMetrics, + KVConnectorStats, + PromMetric, + PromMetricT, + ) from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request @@ -471,3 +476,18 @@ class KVConnectorBase_V1(ABC): which can implement custom aggregation logic on the data dict. 
""" return None + + @classmethod + def build_prom_metrics( + cls, + vllm_config: "VllmConfig", + metric_types: dict[type["PromMetric"], type["PromMetricT"]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ) -> Optional["KVConnectorPromMetrics"]: + """ + Create a KVConnectorPromMetrics subclass which should register + per-connector Prometheus metrics and implement observe() to + expose connector transfer stats via Prometheus. + """ + return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 3f60fbd645..ad907c75a2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -44,8 +44,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils impo ) from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group from vllm.sampling_params import SamplingParams -from vllm.utils import get_kv_cache_torch_dtype from vllm.utils.math_utils import cdiv +from vllm.utils.torch_utils import get_kv_cache_torch_dtype from vllm.v1.core.sched.output import SchedulerOutput from vllm.version import __version__ as VLLM_VERSION @@ -389,7 +389,7 @@ class ReqMeta: def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig): - return lmcache_config.enable_pd + return not lmcache_config.enable_pd def _calculate_mtp_layers(vllm_config, model_config): @@ -403,6 +403,20 @@ def _calculate_mtp_layers(vllm_config, model_config): num_mtp_layers = getattr( model_config.hf_config, "num_nextn_predict_layers", 0 ) + + elif vllm_config.speculative_config.use_eagle(): + try: + draft_model_config = vllm_config.speculative_config.draft_model_config + num_mtp_layers = draft_model_config.get_num_layers( + vllm_config.parallel_config + ) + logger.info("EAGLE detected %d extra layer(s)", num_mtp_layers) + except Exception: + logger.info( + "EAGLE detected, but failed to get the number of extra layers" + "falling back to 1" + ) + num_mtp_layers = 1 return num_mtp_layers @@ -1208,6 +1222,10 @@ class LMCacheConnectorV1Impl: if the CacheManager this allocated blocks for us. """ + # Clear local status in lookup client when a new request is + # successfully scheduled. 
+ self.lookup_client.clear_lookup_status(request.request_id) + kv_transfer_params = ( request.kv_transfer_params if hasattr(request, "kv_transfer_params") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py index 21002fe572..d6ea4f1ab4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py @@ -1,13 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field -from typing import Any +from typing import Any, TypeAlias, TypeVar -from vllm.config.kv_transfer import KVTransferConfig +from prometheus_client import Counter, Gauge, Histogram + +from vllm.config import KVTransferConfig, VllmConfig from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_transfer_state import has_kv_transfer_group from vllm.logger import init_logger +PromMetric: TypeAlias = Gauge | Counter | Histogram +PromMetricT = TypeVar("PromMetricT", bound=PromMetric) + logger = init_logger(__name__) @@ -102,3 +107,83 @@ class KVConnectorLogging: # Reset metrics for next interval self.reset() + + +class KVConnectorPromMetrics: + """ + A base class for per-connector Prometheus metric registration + and recording. + """ + + def __init__( + self, + vllm_config: VllmConfig, + metric_types: dict[type[PromMetric], type[PromMetricT]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ): + self._kv_transfer_config = vllm_config.kv_transfer_config + self._gauge_cls = metric_types[Gauge] + self._counter_cls = metric_types[Counter] + self._histogram_cls = metric_types[Histogram] + self._labelnames = labelnames + self._per_engine_labelvalues = per_engine_labelvalues + + def make_per_engine(self, metric: PromMetric) -> PromMetric: + """ + Create a per-engine child of a prometheus_client.Metric with + the appropriate labels set. The parent metric must be created + using the labelnames list. + """ + return { + idx: metric.labels(*labelvalues) + for idx, labelvalues in self._per_engine_labelvalues.items() + } + + def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0): + """ + Record the supplied transfer statistics to Prometheus metrics. These + statistics are engine-specific, and should be recorded to a metric + with the appropriate 'engine' label. These metric instances can be + created using the make_per_engine() helper method. + """ + raise NotImplementedError + + +class KVConnectorPrometheus: + """ + Support for registering per-connector Prometheus metrics, and + recording transfer statistics to those metrics. Uses + KVConnectorBase.build_prom_metrics(). 
+ """ + + _gauge_cls = Gauge + _counter_cls = Counter + _histogram_cls = Histogram + + def __init__( + self, + vllm_config: VllmConfig, + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ): + self.prom_metrics: KVConnectorPromMetrics | None = None + kv_transfer_config = vllm_config.kv_transfer_config + if kv_transfer_config and kv_transfer_config.kv_connector: + connector_cls = KVConnectorFactory.get_connector_class(kv_transfer_config) + metric_types = { + Gauge: self._gauge_cls, + Counter: self._counter_cls, + Histogram: self._histogram_cls, + } + self.prom_metrics = connector_cls.build_prom_metrics( + vllm_config, + metric_types, + labelnames, + per_engine_labelvalues, + ) + + def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0): + if self.prom_metrics is None: + return + self.prom_metrics.observe(transfer_stats_data, engine_idx) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c1a2ac0124..d56f30bd11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -9,13 +9,19 @@ import torch from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole, ) -from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorPromMetrics, + KVConnectorStats, + PromMetric, + PromMetricT, +) from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput @@ -72,6 +78,27 @@ class MultiKVConnectorStats(KVConnectorStats): self.data[connector_id] = stats +class MultiKVConnectorPromMetrics(KVConnectorPromMetrics): + def __init__( + self, + vllm_config: "VllmConfig", + metric_types: dict[type[PromMetric], type[PromMetricT]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + prom_metrics: dict[str, KVConnectorPromMetrics], + ): + super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues) + self._prom_metrics = prom_metrics + + def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0): + for connector_id, stats_data in transfer_stats_data.items(): + assert connector_id in self._prom_metrics, ( + f"{connector_id} is not contained in the list of registered connectors " + f"with Prometheus metrics support: {self._prom_metrics.keys()}" + ) + self._prom_metrics[connector_id].observe(stats_data["data"], engine_idx) + + class MultiConnector(KVConnectorBase_V1): """ A wrapper for using multiple KVConnectors at the same time. 
@@ -84,19 +111,13 @@ class MultiConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) + self._connectors: list[KVConnectorBase_V1] = [] self._ktc_kv_transfer_config = [] - ktcs = self._kv_transfer_config.kv_connector_extra_config.get("connectors") - assert ktcs is not None - for ktc in ktcs: - temp_config = copy.copy(vllm_config) - engine_id = ktc.get("engine_id", self._kv_transfer_config.engine_id) - temp_config.kv_transfer_config = KVTransferConfig( - **ktc, engine_id=engine_id - ) - self._connectors.append( - KVConnectorFactory.create_connector(temp_config, role) - ) + for connector_cls, temp_config in self._get_connector_classes_and_configs( + vllm_config + ): + self._connectors.append(connector_cls(temp_config, role)) self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config) # A mapping from request id to the index of the connector chosen to @@ -109,6 +130,32 @@ class MultiConnector(KVConnectorBase_V1): # Propagated from scheduler to worker side via the connector metadata. self._extra_async_saves: dict[str, int] = {} + @classmethod + def _get_connector_classes_and_configs( + cls, vllm_config: "VllmConfig" + ) -> list[tuple[type[KVConnectorBaseType], "VllmConfig"]]: + assert vllm_config.kv_transfer_config is not None + ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( + "connectors" + ) + assert ktcs is not None + ret: list[tuple[type[KVConnectorBaseType], VllmConfig]] = [] + for ktc in ktcs: + temp_config = copy.copy(vllm_config) + engine_id = ktc.get("engine_id", vllm_config.kv_transfer_config.engine_id) + temp_config.kv_transfer_config = KVTransferConfig( + **ktc, engine_id=engine_id + ) + ret.append( + ( + KVConnectorFactory.get_connector_class( + temp_config.kv_transfer_config + ), + temp_config, + ) + ) + return ret + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): for c in self._connectors: c.register_kv_caches(kv_caches) @@ -295,18 +342,12 @@ class MultiConnector(KVConnectorBase_V1): None if the connector does not require a specific layout. 
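The refactored `_get_connector_classes_and_configs()` helper above consumes the same `kv_connector_extra_config["connectors"]` list as before. A hedged configuration example (the nested connector choices and the `engine_id` value are illustrative, not a recommended setup):

```python
# Illustrative MultiConnector configuration; each entry in "connectors" becomes
# its own KVTransferConfig, inheriting engine_id from the parent unless overridden.
# Connector-specific required fields (e.g. roles) are omitted from this sketch.
from vllm.config.kv_transfer import KVTransferConfig

kv_transfer_config = KVTransferConfig(
    kv_connector="MultiConnector",
    kv_connector_extra_config={
        "connectors": [
            {"kv_connector": "NixlConnector"},
            {"kv_connector": "SharedStorageConnector", "engine_id": "storage-0"},
        ]
    },
)
```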
""" assert vllm_config.kv_transfer_config is not None - ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( - "connectors" - ) - assert ktcs is not None layouts: set[str] = set() - temp_vllm_config = copy.copy(vllm_config) - for ktc in ktcs: - kv_transfer_config = KVTransferConfig(**ktc) - temp_vllm_config.kv_transfer_config = kv_transfer_config - connector_cls = KVConnectorFactory.get_connector_class(kv_transfer_config) + for connector_cls, temp_config in cls._get_connector_classes_and_configs( + vllm_config + ): required_kvcache_layout = connector_cls.get_required_kvcache_layout( - temp_vllm_config + temp_config ) if required_kvcache_layout is not None: layouts.add(required_kvcache_layout) @@ -372,3 +413,28 @@ class MultiConnector(KVConnectorBase_V1): stats_by_connector = MultiKVConnectorStats() stats_by_connector[c.__class__.__name__] = stats return stats_by_connector + + @classmethod + def build_prom_metrics( + cls, + vllm_config: "VllmConfig", + metric_types: dict[type["PromMetric"], type["PromMetricT"]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ) -> KVConnectorPromMetrics: + prom_metrics: dict[str, KVConnectorPromMetrics] = {} + for connector_cls, temp_config in cls._get_connector_classes_and_configs( + vllm_config + ): + connector_prom = connector_cls.build_prom_metrics( + temp_config, metric_types, labelnames, per_engine_labelvalues + ) + if connector_prom is not None: + prom_metrics[connector_cls.__name__] = connector_prom + return MultiKVConnectorPromMetrics( + vllm_config, + metric_types, + labelnames, + per_engine_labelvalues, + prom_metrics, + ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 72fcb5cd5b..275a8c7340 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -30,7 +30,12 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata, KVConnectorRole, ) -from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorPromMetrics, + KVConnectorStats, + PromMetric, + PromMetricT, +) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -254,6 +259,18 @@ class NixlConnector(KVConnectorBase_V1): else NixlKVConnectorStats() ) + @classmethod + def build_prom_metrics( + cls, + vllm_config: VllmConfig, + metric_types: dict[type[PromMetric], type[PromMetricT]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ) -> KVConnectorPromMetrics: + return NixlPromMetrics( + vllm_config, metric_types, labelnames, per_engine_labelvalues + ) + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: assert self.connector_worker is not None assert isinstance(self._connector_metadata, NixlConnectorMetadata) @@ -1960,3 +1977,125 @@ class NixlKVConnectorStats(KVConnectorStats): @property def num_successful_transfers(self) -> int: return len(self.data["transfer_duration"]) + + +class NixlPromMetrics(KVConnectorPromMetrics): + def __init__( + self, + vllm_config: VllmConfig, + metric_types: dict[type[PromMetric], type[PromMetricT]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ): + super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues) + + buckets = [ + 0.001, + 0.005, + 
0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.2, + 0.3, + 0.5, + 0.75, + 1.0, + 5.0, + ] + nixl_histogram_xfer_time = self._histogram_cls( + name="vllm:nixl_xfer_time_seconds", + documentation="Histogram of transfer duration for NIXL KV Cache transfers.", + buckets=buckets[1:], + labelnames=labelnames, + ) + self.nixl_histogram_xfer_time = self.make_per_engine(nixl_histogram_xfer_time) + nixl_histogram_post_time = self._histogram_cls( + name="vllm:nixl_post_time_seconds", + documentation="Histogram of transfer post time for NIXL KV" + " Cache transfers.", + buckets=buckets, + labelnames=labelnames, + ) + self.nixl_histogram_post_time = self.make_per_engine(nixl_histogram_post_time) + # uniform 2kb to 16gb range + buckets = [2 ** (10 + i) for i in range(1, 25, 2)] + nixl_histogram_bytes_transferred = self._histogram_cls( + name="vllm:nixl_bytes_transferred", + documentation="Histogram of bytes transferred per NIXL KV Cache transfers.", + buckets=buckets, + labelnames=labelnames, + ) + self.nixl_histogram_bytes_transferred = self.make_per_engine( + nixl_histogram_bytes_transferred + ) + buckets = [ + 10, + 20, + 30, + 50, + 75, + 100, + 200, + 400, + 1000, + 2000, + 4000, + 10000, + 20000, + 50000, + ] + nixl_histogram_num_descriptors = self._histogram_cls( + name="vllm:nixl_num_descriptors", + documentation="Histogram of number of descriptors per NIXL" + " KV Cache transfers.", + buckets=buckets, + labelnames=labelnames, + ) + self.nixl_histogram_num_descriptors = self.make_per_engine( + nixl_histogram_num_descriptors + ) + counter_nixl_num_failed_transfers = self._counter_cls( + name="vllm:nixl_num_failed_transfers", + documentation="Number of failed NIXL KV Cache transfers.", + labelnames=labelnames, + ) + self.counter_nixl_num_failed_transfers = self.make_per_engine( + counter_nixl_num_failed_transfers + ) + counter_nixl_num_failed_notifications = self._counter_cls( + name="vllm:nixl_num_failed_notifications", + documentation="Number of failed NIXL KV Cache notifications.", + labelnames=labelnames, + ) + self.counter_nixl_num_failed_notifications = self.make_per_engine( + counter_nixl_num_failed_notifications + ) + + def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0): + for prom_obj, list_item_key in zip( + [ + self.nixl_histogram_xfer_time, + self.nixl_histogram_post_time, + self.nixl_histogram_bytes_transferred, + self.nixl_histogram_num_descriptors, + ], + [ + "transfer_duration", + "post_duration", + "bytes_transferred", + "num_descriptors", + ], + ): + for list_item in transfer_stats_data[list_item_key]: + prom_obj[engine_idx].observe(list_item) + for counter_obj, counter_item_key in zip( + [ + self.counter_nixl_num_failed_transfers, + self.counter_nixl_num_failed_notifications, + ], + ["num_failed_transfers", "num_failed_notifications"], + ): + for list_item in transfer_stats_data[counter_item_key]: + counter_obj[engine_idx].inc(list_item) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 6d4ffc152d..19344e5784 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -494,5 +494,5 @@ def yield_req_data( yield from zip( cached_reqs.req_ids, cached_reqs.new_block_ids, - cached_reqs.resumed_from_preemption, + (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids), ) diff --git 
a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index e47cde2614..780dd12fcc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -415,10 +415,10 @@ class P2pNcclConnector(KVConnectorBase_V1): for i, req_id in enumerate(cached_reqs.req_ids): num_computed_tokens = cached_reqs.num_computed_tokens[i] new_block_ids = cached_reqs.new_block_ids[i] - resumed_from_preemption = cached_reqs.resumed_from_preemption[i] + resumed_from_preemption = req_id in cached_reqs.resumed_req_ids if self.is_producer: - num_scheduled_tokens = (scheduler_output.num_scheduled_tokens)[req_id] + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id] num_tokens = num_scheduled_tokens + num_computed_tokens assert req_id in self.chunked_prefill assert new_block_ids is not None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 3ef287817c..0e748db666 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import logging import os import threading @@ -96,19 +97,30 @@ class P2pNcclEngine: # Each card corresponds to a ZMQ address. self.zmq_address = f"{self._hostname}:{self._port}" - # The `http_port` must be consistent with the port of OpenAI. - self.http_address = ( - f"{self._hostname}:{self.config.kv_connector_extra_config['http_port']}" - ) - # If `proxy_ip` or `proxy_port` is `""`, # then the ping thread will not be enabled. proxy_ip = self.config.get_from_extra_config("proxy_ip", "") proxy_port = self.config.get_from_extra_config("proxy_port", "") if proxy_ip == "" or proxy_port == "": self.proxy_address = "" + self.http_address = "" else: self.proxy_address = proxy_ip + ":" + proxy_port + # the `http_port` must be consistent with the port of OpenAI. + http_port = self.config.get_from_extra_config("http_port", None) + if http_port is None: + example_cfg = { + "kv_connector": "P2pNcclConnector", + "kv_connector_extra_config": {"http_port": 8000}, + } + example = ( + f"--port=8000 --kv-transfer-config='{json.dumps(example_cfg)}'" + ) + raise ValueError( + "kv_connector_extra_config.http_port is required. 
" + f"Example: {example}" + ) + self.http_address = f"{self._hostname}:{http_port}" self.context = zmq.Context() self.router_socket = self.context.socket(zmq.ROUTER) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index d0cd4b07c5..9c230d7d0d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -336,36 +336,34 @@ class SharedStorageConnector(KVConnectorBase_V1): cached_reqs = scheduler_output.scheduled_cached_reqs for i, req_id in enumerate(cached_reqs.req_ids): + resumed_from_preemption = req_id in cached_reqs.resumed_req_ids + if not resumed_from_preemption or req_id not in self._requests_need_load: + continue + num_computed_tokens = cached_reqs.num_computed_tokens[i] num_new_tokens = scheduler_output.num_scheduled_tokens[req_id] new_block_ids = cached_reqs.new_block_ids[i] - resumed_from_preemption = cached_reqs.resumed_from_preemption[i] - # NOTE(rob): here we rely on the resumed requests being - # the first N requests in the list scheduled_cache_reqs. - if not resumed_from_preemption: - break - if req_id in self._requests_need_load: - # NOTE(rob): cached_req_data does not have the full - # list of token ids (only new tokens). So we look it - # up in the actual request object. - request = self._requests_need_load[req_id] - total_tokens = num_computed_tokens + num_new_tokens - token_ids = request.all_token_ids[:total_tokens] + # NOTE(rob): cached_req_data does not have the full + # list of token ids (only new tokens). So we look it + # up in the actual request object. + request = self._requests_need_load[req_id] + total_tokens = num_computed_tokens + num_new_tokens + token_ids = request.all_token_ids[:total_tokens] - # NOTE(rob): For resumed req, new_block_ids is all - # of the block_ids for the request. - assert new_block_ids is not None - block_ids = new_block_ids[0] + # NOTE(rob): For resumed req, new_block_ids is all + # of the block_ids for the request. 
+ assert new_block_ids is not None + block_ids = new_block_ids[0] - meta.add_request( - token_ids=token_ids, - block_ids=block_ids, - block_size=self._block_size, - is_store=False, - mm_hashes=[f.identifier for f in request.mm_features], - ) - total_need_load += 1 + meta.add_request( + token_ids=token_ids, + block_ids=block_ids, + block_size=self._block_size, + is_store=False, + mm_hashes=[f.identifier for f in request.mm_features], + ) + total_need_load += 1 assert total_need_load == len(self._requests_need_load) self._requests_need_load.clear() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e8f8e3f8c2..b31e4931f2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -75,13 +75,12 @@ from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized from vllm.reasoning import ReasoningParserManager -from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import ( get_model_path, is_interleaved, maybe_override_with_speculators, ) -from vllm.transformers_utils.utils import check_gguf_file +from vllm.transformers_utils.utils import check_gguf_file, is_s3 from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip @@ -1126,15 +1125,6 @@ class EngineArgs: if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" - # NOTE: This is to allow model loading from S3 in CI - if ( - not isinstance(self, AsyncEngineArgs) - and envs.VLLM_CI_USE_S3 - and self.model in MODELS_ON_S3 - and self.load_format == "auto" - ): - self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}" - if self.disable_mm_preprocessor_cache: logger.warning( "`--disable-mm-preprocessor-cache` is deprecated " @@ -1305,20 +1295,26 @@ class EngineArgs: device_config = DeviceConfig(device=cast(Device, current_platform.device_type)) + # Check if the model is a speculator and override model/tokenizer/config + # BEFORE creating ModelConfig, so the config is created with the target model + # Skip speculator detection for S3 models since HuggingFace cannot load + # configs directly from S3 URLs. S3 models can still use speculators with + # explicit --speculative-config. + if not is_s3(self.model): + (self.model, self.tokenizer, self.speculative_config) = ( + maybe_override_with_speculators( + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + vllm_speculative_config=self.speculative_config, + ) + ) + model_config = self.create_model_config() self.model = model_config.model self.tokenizer = model_config.tokenizer - (self.model, self.tokenizer, self.speculative_config) = ( - maybe_override_with_speculators( - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code, - vllm_speculative_config=self.speculative_config, - ) - ) - # * If VLLM_USE_V1 is unset, we enable V1 for "supported features" # and fall back to V0 for experimental or unsupported features. 
# * If VLLM_USE_V1=1, we enable V1 for supported + experimental diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 959a034281..24fcd9fe1c 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -77,6 +77,7 @@ class EngineClient(ABC): lora_request: LoRARequest | None = None, trace_headers: Mapping[str, str] | None = None, priority: int = 0, + truncate_prompt_tokens: int | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from a pooling model.""" diff --git a/vllm/entrypoints/cli/__init__.py b/vllm/entrypoints/cli/__init__.py index 211e157fc7..9dff68236f 100644 --- a/vllm/entrypoints/cli/__init__.py +++ b/vllm/entrypoints/cli/__init__.py @@ -2,10 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand +from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand __all__: list[str] = [ "BenchmarkLatencySubcommand", "BenchmarkServingSubcommand", + "BenchmarkSweepSubcommand", "BenchmarkThroughputSubcommand", ] diff --git a/vllm/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py index 3263459fd6..d8543822cf 100644 --- a/vllm/entrypoints/cli/benchmark/base.py +++ b/vllm/entrypoints/cli/benchmark/base.py @@ -6,7 +6,7 @@ from vllm.entrypoints.cli.types import CLISubcommand class BenchmarkSubcommandBase(CLISubcommand): - """The base class of subcommands for vllm bench.""" + """The base class of subcommands for `vllm bench`.""" help: str diff --git a/vllm/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py index 548ddf4d60..60f2b03341 100644 --- a/vllm/entrypoints/cli/benchmark/latency.py +++ b/vllm/entrypoints/cli/benchmark/latency.py @@ -7,7 +7,7 @@ from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase class BenchmarkLatencySubcommand(BenchmarkSubcommandBase): - """The `latency` subcommand for vllm bench.""" + """The `latency` subcommand for `vllm bench`.""" name = "latency" help = "Benchmark the latency of a single batch of requests." diff --git a/vllm/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py index b085f52afb..6616305c74 100644 --- a/vllm/entrypoints/cli/benchmark/serve.py +++ b/vllm/entrypoints/cli/benchmark/serve.py @@ -7,7 +7,7 @@ from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase class BenchmarkServingSubcommand(BenchmarkSubcommandBase): - """The `serve` subcommand for vllm bench.""" + """The `serve` subcommand for `vllm bench`.""" name = "serve" help = "Benchmark the online serving throughput." diff --git a/vllm/entrypoints/cli/benchmark/sweep.py b/vllm/entrypoints/cli/benchmark/sweep.py new file mode 100644 index 0000000000..c385207690 --- /dev/null +++ b/vllm/entrypoints/cli/benchmark/sweep.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.benchmarks.sweep.cli import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class BenchmarkSweepSubcommand(BenchmarkSubcommandBase): + """The `sweep` subcommand for `vllm bench`.""" + + name = "sweep" + help = "Benchmark for a parameter sweep." 
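For reference, the new `BenchmarkSweepSubcommand` above surfaces the sweep tooling as `vllm bench sweep`. The exact dispatch inside `vllm/benchmarks/sweep/cli.py` is not shown in this diff, but the `parser_name`/`parser_help` class attributes added to the args dataclasses suggest a subparser layout along these lines (a sketch under that assumption):

```python
# Assumed shape of the sweep CLI dispatcher, built from the ClassVars added in
# this patch; the plot module path is inferred, not shown in the diff.
import argparse

from vllm.benchmarks.sweep.plot import SweepPlotArgs
from vllm.benchmarks.sweep.serve import SweepServeArgs
from vllm.benchmarks.sweep.serve_sla import SweepServeSLAArgs

parser = argparse.ArgumentParser(prog="vllm bench sweep")
subparsers = parser.add_subparsers(dest="cmd", required=True)
for args_cls in (SweepServeArgs, SweepServeSLAArgs, SweepPlotArgs):
    sub = subparsers.add_parser(args_cls.parser_name, help=args_cls.parser_help)
    args_cls.add_cli_args(sub)
```

With that wiring, an invocation such as `vllm bench sweep plot results --var-x request_throughput --var-y p99_e2el_ms` ends up in `SweepPlotArgs.from_cli_args()`.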
+ + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) diff --git a/vllm/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py index c25be75ec1..2097f9ea07 100644 --- a/vllm/entrypoints/cli/benchmark/throughput.py +++ b/vllm/entrypoints/cli/benchmark/throughput.py @@ -7,7 +7,7 @@ from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase): - """The `throughput` subcommand for vllm bench.""" + """The `throughput` subcommand for `vllm bench`.""" name = "throughput" help = "Benchmark offline inference throughput." diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 8886d7c42d..0041db8220 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Union from openai.types.responses.tool import Mcp from openai_harmony import Author, Message, Role, StreamState, TextContent +from vllm import envs from vllm.entrypoints.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, @@ -109,6 +110,28 @@ class ConversationContext(ABC): raise NotImplementedError("Should not be called.") +def _create_json_parse_error_messages( + last_msg: Message, e: json.JSONDecodeError +) -> list[Message]: + """ + Creates an error message when json parse failed. + """ + error_msg = ( + f"Error parsing tool arguments as JSON: {str(e)}. " + "Please ensure the tool call arguments are valid JSON and try again." + ) + content = TextContent(text=error_msg) + author = Author(role=Role.TOOL, name=last_msg.recipient) + return [ + Message( + author=author, + content=[content], + recipient=Role.ASSISTANT, + channel=last_msg.channel, + ) + ] + + class SimpleContext(ConversationContext): def __init__(self): self.last_output = None @@ -339,7 +362,13 @@ class HarmonyContext(ConversationContext): if isinstance(tool_session, Tool): return await tool_session.get_result(self) tool_name = last_msg.recipient.split(".")[1] - args = json.loads(last_msg.content[0].text) + if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: + try: + args = json.loads(last_msg.content[0].text) + except json.JSONDecodeError as e: + return _create_json_parse_error_messages(last_msg, e) + else: + args = json.loads(last_msg.content[0].text) result = await tool_session.call_tool(tool_name, args) result_str = result.content[0].text content = TextContent(text=result_str) @@ -420,7 +449,13 @@ class HarmonyContext(ConversationContext): if isinstance(tool_session, Tool): return await tool_session.get_result(self) tool_name = last_msg.recipient.split(".")[1].split(" ")[0] - args = json.loads(last_msg.content[0].text) + if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: + try: + args = json.loads(last_msg.content[0].text) + except json.JSONDecodeError as e: + return _create_json_parse_error_messages(last_msg, e) + else: + args = json.loads(last_msg.content[0].text) result = await tool_session.call_tool(tool_name, args) result_str = result.content[0].text content = TextContent(text=result_str) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 97f95a97ee..7958d03177 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -61,15 +61,19 @@ _harmony_encoding = None # they are available and requested by the user. # Tool args are provided by MCP tool descriptions. 
Output # of the tools are stringified. -BUILTIN_TOOLS = { +MCP_BUILTIN_TOOLS: set[str] = { "web_search_preview", "code_interpreter", "container", } -def has_custom_tools(tool_types: list[str]) -> bool: - return not set(tool_types).issubset(BUILTIN_TOOLS) +def has_custom_tools(tool_types: set[str]) -> bool: + """ + Checks if the given tool types are custom tools + (i.e. any tool other than MCP buildin tools) + """ + return not tool_types.issubset(MCP_BUILTIN_TOOLS) def get_encoding(): @@ -340,7 +344,24 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: if len(message.content) != 1: raise ValueError("Invalid number of contents in browser message") content = message.content[0] - browser_call = json.loads(content.text) + # We do not need to check the VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY + # env variable since if it is not set, we are certain the json is valid + # The use of Actions for web search will be removed entirely in + # the future, so this is only necessary temporarily + try: + browser_call = json.loads(content.text) + except json.JSONDecodeError: + # If the content is not valid JSON, then it was + # caught and retried by vLLM, which means we + # need to make note of that so the user is aware + json_retry_output_message = ( + f"Invalid JSON args, caught and retried: {content.text}" + ) + browser_call = { + "query": json_retry_output_message, + "url": json_retry_output_message, + "pattern": json_retry_output_message, + } # TODO: translate to url properly! if recipient == "browser.search": action = ActionSearch( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 71939d6c41..f3aa5351e5 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -107,6 +107,7 @@ from vllm.entrypoints.utils import ( ) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager +from vllm.tasks import POOLING_TASKS from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import is_valid_ipv6_address @@ -1748,12 +1749,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) ) - if ( - any( - task in supported_tasks - for task in ["token_embed", "token_classify", "plugin"] - ) - ) + if any(task in POOLING_TASKS for task in supported_tasks) else None ) state.openai_serving_embedding = ( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0778e4d787..d0061f9d5b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -49,6 +49,8 @@ from openai.types.responses.response_reasoning_item import ( ) from openai_harmony import Message as OpenAIHarmonyMessage +from vllm.config.pooler import get_use_activation +from vllm.tasks import PoolingTask from vllm.utils.serial_utils import ( EmbedDType, EncodingFormat, @@ -1669,8 +1671,58 @@ class EmbeddingChatRequest(OpenAIBaseModel): EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest -PoolingCompletionRequest = EmbeddingCompletionRequest -PoolingChatRequest = EmbeddingChatRequest + +class PoolingCompletionRequest(EmbeddingCompletionRequest): + task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + 
use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + use_activation=get_use_activation(self), + ) + + +class PoolingChatRequest(EmbeddingChatRequest): + task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + use_activation=get_use_activation(self), + ) + T = TypeVar("T") @@ -1686,6 +1738,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): """ data: T + task: PoolingTask = "plugin" encoding_format: EncodingFormat = "float" embed_dtype: EmbedDType = Field( default="float32", @@ -1749,14 +1802,27 @@ class ScoreRequest(OpenAIBaseModel): ), ) - activation: bool | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:score-extra-params] def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=get_use_activation(self), ) @@ -1783,14 +1849,27 @@ class RerankRequest(OpenAIBaseModel): ), ) - activation: bool | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:rerank-extra-params] def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=get_use_activation(self), ) @@ -1958,14 +2037,27 @@ class ClassificationRequest(OpenAIBaseModel): ), ) - activation: bool | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) # --8<-- [end:classification-extra-params] def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=get_use_activation(self), ) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 568896ccbf..0eade27211 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -170,15 +170,24 @@ class OpenAIServingPooling(OpenAIServing): pooling_params = request.to_pooling_params() pooling_task: PoolingTask - if "token_embed" in self.supported_tasks: - pooling_task = "token_embed" - elif "token_classify" in self.supported_tasks: - pooling_task = "token_classify" - elif "plugin" in self.supported_tasks: - pooling_task = "plugin" + if request.task is None: + if "token_embed" in self.supported_tasks: + pooling_task = "token_embed" + elif "token_classify" in self.supported_tasks: + pooling_task = "token_classify" + elif "plugin" in self.supported_tasks: + pooling_task = "plugin" + else: + return self.create_error_response( + f"pooling_task must be one of {self.supported_tasks}." + ) else: + pooling_task = request.task + + if pooling_task not in self.supported_tasks: return self.create_error_response( - f"pooling_task must be one of {self.supported_tasks}." + f"Task {pooling_task} is not supported, it" + f" must be one of {self.supported_tasks}." ) try: diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index d43bc00a49..2ee8de5fba 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -48,6 +48,7 @@ from openai.types.responses.response_output_text import Logprob, LogprobTopLogpr from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent, ) +from openai.types.responses.tool import Tool from openai_harmony import Message as OpenAIHarmonyMessage from vllm import envs @@ -106,6 +107,23 @@ from vllm.utils import random_uuid logger = init_logger(__name__) +def extract_tool_types(tools: list[Tool]) -> set[str]: + """ + Extracts the tool types from the given tools. + """ + tool_types: set[str] = set() + for tool in tools: + if tool.type == "mcp": + # Allow the MCP Tool type to enable built in tools if the + # server_label is allowlisted in + # envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS + if tool.server_label in envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: + tool_types.add(tool.server_label) + else: + tool_types.add(tool.type) + return tool_types + + class OpenAIServingResponses(OpenAIServing): def __init__( self, @@ -879,7 +897,7 @@ class OpenAIServingResponses(OpenAIServing): return messages def _construct_harmony_system_input_message( - self, request: ResponsesRequest, with_custom_tools: bool, tool_types: list[str] + self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str] ) -> OpenAIHarmonyMessage: reasoning_effort = request.reasoning.effort if request.reasoning else None enable_browser = ( @@ -927,17 +945,7 @@ class OpenAIServingResponses(OpenAIServing): messages: list[OpenAIHarmonyMessage] = [] if prev_response is None: # New conversation. 
- tool_types = [tool.type for tool in request.tools] - # Allow the MCP Tool type to enable built in tools if the - # server_label is allowlisted in - # envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS - if envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS: - for tool in request.tools: - if ( - tool.type == "mcp" - and tool.server_label in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS - ): - tool_types.append(tool.server_label) + tool_types = extract_tool_types(request.tools) with_custom_tools = has_custom_tools(tool_types) sys_msg = self._construct_harmony_system_input_message( diff --git a/vllm/envs.py b/vllm/envs.py index 73bb2678ea..0548f01fc8 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -198,17 +198,18 @@ if TYPE_CHECKING: VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: str | None = None + VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set() VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False + VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024 VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False - VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK: bool = False + VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK: bool = True VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False VLLM_DBO_COMM_SMS: int = 20 - GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = [] VLLM_PATTERN_MATCH_DEBUG: str | None = None VLLM_DEBUG_DUMP_PATH: str | None = None VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True @@ -246,10 +247,19 @@ def maybe_convert_bool(value: str | None) -> bool | None: return bool(int(value)) +def disable_compile_cache() -> bool: + return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))) + + def use_aot_compile() -> bool: from vllm.utils.torch_utils import is_torch_equal_or_newer - default_value = "1" if is_torch_equal_or_newer("2.10.0.dev") else "0" + default_value = ( + "1" + if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache() + else "0" + ) + return os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1" @@ -353,6 +363,24 @@ def env_list_with_choices( return _get_validated_env_list +def env_set_with_choices( + env_name: str, + default: list[str], + choices: list[str] | Callable[[], list[str]], + case_sensitive: bool = True, +) -> Callable[[], set[str]]: + """ + Creates a lambda that validates an environment variable + containing comma-separated values against allowed choices and + returns the values as a set. + """ + + def _get_validated_env_set() -> set[str]: + return set(env_list_with_choices(env_name, default, choices, case_sensitive)()) + + return _get_validated_env_set + + def get_vllm_port() -> int | None: """Get the port from VLLM_PORT environment variable. @@ -944,9 +972,7 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float( os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1") ), - "VLLM_DISABLE_COMPILE_CACHE": lambda: bool( - int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0")) - ), + "VLLM_DISABLE_COMPILE_CACHE": disable_compile_cache, # If set, vllm will run in development mode, which will enable # some additional endpoints for developing and debugging, # e.g.
`/reset_prefix_cache` @@ -1327,10 +1353,25 @@ environment_variables: dict[str, Callable[[], Any]] = { ), # Allows vllm to find tuned config under customized folder "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + # Valid values are container,code_interpreter,web_search_preview + # ex VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter + # If the server_label of your mcp tool is not in this list it will + # be completely ignored. + "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_set_with_choices( + "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", + default=[], + choices=["container", "code_interpreter", "web_search_preview"], + ), # Allows harmony instructions to be injected on system messages "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool( int(os.getenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "0")) ), + # Enable automatic retry when tool call JSON parsing fails + # If enabled, returns an error message to the model to retry + # If disabled (default), raises an exception and fails the request + "VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY": lambda: bool( + int(os.getenv("VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY", "0")) + ), # Add optional custom scopes for profiling, disable to avoid overheads "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool( int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0")) @@ -1362,7 +1403,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Allow DeepEP to use nvlink for internode_ll kernel, turn this on for # better latency on GB200 like system "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK": lambda: bool( - int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "0")) + int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "1")) ), # Allow DeepEP to use MNNVL (multi-node nvlink) for internode_ll kernel, # turn this for better latency on GB200 like system @@ -1372,13 +1413,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # The number of SMs to allocate for communication kernels when running DBO # the rest of the SMs on the device will be allocated to compute "VLLM_DBO_COMM_SMS": lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")), - # Valid values are container,code_interpreter,web_search_preview - # ex GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter - "GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_list_with_choices( - "GPT_OSS_SYSTEM_TOOL_MCP_LABELS", - [], - ["container", "code_interpreter", "web_search_preview"], - ), # Enable max_autotune & coordinate_descent_tuning in inductor_config # to compile static shapes passed from compile_sizes in compilation_config # If set to 1, enable max_autotune; By default, this is enabled (1) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 7368bfd35f..5706786bcc 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -478,9 +478,48 @@ def matmul_batch_invariant(a, b, *, out=None): elif a.ndim == 3 and b.ndim == 3: # Handle batched case like bmm return bmm_batch_invariant(a, b, out=out) + elif a.ndim == 3 and b.ndim == 2: + # Handle 3D x 2D: common for linear layers + # (batch, seq, hidden) @ (hidden, out) -> (batch, seq, out) + # Reshape to 2D, do mm, reshape back + batch, seq, hidden = a.shape + a_2d = a.reshape(-1, hidden) + result_2d = matmul_persistent(a_2d, b) + result = result_2d.reshape(batch, seq, -1) + if out is not None: + out.copy_(result) + return out + return result + elif a.ndim == 2 and b.ndim == 3: + # Handle 2D x 3D: (M, K) @ (B, K, N) -> (B, M, N) 
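+ # (this mirrors torch.matmul's broadcasting of a 2D first operand across the batch dim)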
+ # By broadcasting `a` to 3D, we can reuse the batched matrix + # multiplication logic. + a_expanded = a.unsqueeze(0).expand(b.shape[0], -1, -1) + return bmm_batch_invariant(a_expanded, b, out=out) + elif a.ndim == 4 and b.ndim == 4: + # Handle 4D attention tensors: [batch, heads, seq, dim] + # Reshape to 3D, process, reshape back + batch, heads, seq_a, dim_a = a.shape + _, _, dim_b, seq_b = b.shape + + # Reshape to [batch*heads, seq_a, dim_a] + a_3d = a.reshape(batch * heads, seq_a, dim_a) + b_3d = b.reshape(batch * heads, dim_b, seq_b) + + # Do batched matmul + result_3d = bmm_batch_invariant(a_3d, b_3d) + + # Reshape back to [batch, heads, seq_a, seq_b] + result = result_3d.reshape(batch, heads, seq_a, seq_b) + + if out is not None: + out.copy_(result) + return out + return result else: raise ValueError( - f"matmul_batch_invariant currently only supports 2D x 2D and 3D x 3D, " + f"matmul_batch_invariant currently only supports 2D x 2D, 3D x 3D, " + f"3D x 2D, 2D x 3D, and 4D x 4D, " f"got shapes {a.shape} and {b.shape}" ) @@ -667,7 +706,8 @@ def rms_norm_batch_invariant( def linear_batch_invariant(input, weight, bias=None): - output = mm_batch_invariant(input, weight.t()) + output = matmul_batch_invariant(input, weight.t()) + if bias is not None: output = output + bias return output @@ -753,13 +793,13 @@ def override_envs_for_invariance(): curr_attn_backend = envs.VLLM_ATTENTION_BACKEND supported_backends = [ "FLASH_ATTN", # best supported backend - "FLEX_ATTENTION", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", + # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance ] if curr_attn_backend not in supported_backends: warning = ( diff --git a/vllm/model_executor/layers/fla/ops/kda.py b/vllm/model_executor/layers/fla/ops/kda.py index a10847d347..700f287ca4 100644 --- a/vllm/model_executor/layers/fla/ops/kda.py +++ b/vllm/model_executor/layers/fla/ops/kda.py @@ -1304,7 +1304,7 @@ def kda_gate_fwd_kernel( tl.store(y_ptr, b_y.to(y.dtype.element_ty), boundary_check=(0, 1)) -def kda_gate_fwd( +def fused_kda_gate( g: torch.Tensor, A: torch.Tensor, head_k_dim: int, diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 13866a5c5b..929cff7998 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -16,6 +16,7 @@ from vllm.utils.math_utils import round_up from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, dbo_enabled, + dbo_get_previous_event, dbo_switch_to_comm, dbo_switch_to_compute, dbo_switch_to_compute_sync, @@ -110,6 +111,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # for the other ubatch before the dispatch kernel starts. dbo_yield_and_switch_from_compute_to_comm() + # capture a DeepEP event and pass it as previous_event so + # DeepEP honors the dependency internally. + previous_event = dbo_get_previous_event(self.buffer.capture) + ( num_tokens_per_rank, num_tokens_per_rdma_rank, @@ -119,7 +124,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) = self.buffer.get_dispatch_layout( topk_idx=rank_topk_ids, num_experts=num_experts, - previous_event=None, + previous_event=previous_event, async_finish=False, allocate_on_comm_stream=False, ) @@ -148,7 +153,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # to this value. 
expert_alignment=1, config=self._get_dispatch_config(), - previous_event=None, + previous_event=previous_event, async_finish=self.async_prepare and not dbo_enabled(), allocate_on_comm_stream=False, ) @@ -339,13 +344,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): assert fused_expert_output.dtype == torch.bfloat16, ( f"Expected fused_expert_output bfloat16, got {fused_expert_output.dtype}" ) + previous_event = dbo_get_previous_event(self.buffer.capture) combined_x, _, event = self.buffer.combine( # HT combine only supports BF16 x=fused_expert_output, handle=handle, topk_weights=None, config=self._get_combine_config(), - previous_event=None, + previous_event=previous_event, async_finish=do_async and not dbo_enabled(), allocate_on_comm_stream=False, ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 294dddade6..7dbe4bc543 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1135,6 +1135,7 @@ class FusedMoE(CustomOp): ) self.global_num_experts = num_experts + num_redundant_experts + self.logical_num_experts = num_experts self.zero_expert_num = zero_expert_num self.zero_expert_type = zero_expert_type @@ -1998,13 +1999,12 @@ class FusedMoE(CustomOp): moe = self.moe_config - # Note here we use `num_experts` which is logical expert count if self.vllm_config.parallel_config.enable_dbo: states_shape = (2, moe.max_num_tokens, self.hidden_size) - logits_shape = (2, moe.max_num_tokens, moe.num_experts) + logits_shape = (2, moe.max_num_tokens, self.logical_num_experts) else: states_shape = (moe.max_num_tokens, self.hidden_size) - logits_shape = (moe.max_num_tokens, moe.num_experts) + logits_shape = (moe.max_num_tokens, self.logical_num_experts) self.batched_hidden_states = torch.zeros( states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py new file mode 100644 index 0000000000..c45e7546fa --- /dev/null +++ b/vllm/model_executor/layers/kda.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +from einops import rearrange +from torch import nn + +from vllm.attention import AttentionBackend +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata + +from .fla.ops.kda import ( + FusedRMSNormGated, + chunk_kda, + fused_kda_gate, + fused_recurrent_kda, +) +from .linear import ( + ColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from .mamba.abstract import MambaBase +from .mamba.mamba_utils import MambaStateDtypeCalculator, MambaStateShapeCalculator +from .mamba.ops.causal_conv1d import causal_conv1d_fn, causal_conv1d_update +from .quantization.base_config import QuantizationConfig + +logger = init_logger(__name__) + + +def kda_attention( + hidden_states: torch.Tensor, + output: torch.Tensor, + 
layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self._forward(hidden_states=hidden_states, output=output) + + +def kda_attention_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="kda_attention", + op_func=kda_attention, + mutates_args=["output"], + fake_impl=kda_attention_fake, +) + + +class KimiDeltaAttention(nn.Module, MambaBase): + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend + + return GDNAttentionBackend + + def get_state_dtype( + self, + ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]: + if self.model_config is None or self.cache_config is None: + raise ValueError("model_config and cache_config must be set") + return MambaStateDtypeCalculator.kda_state_dtype( + self.model_config.dtype, self.cache_config.mamba_cache_dtype + ) + + def get_state_shape( + self, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.kda_state_shape( + self.tp_size, self.num_heads, self.head_dim, conv_kernel_size=self.conv_size + ) + + def __init__( + self, + layer_idx: int, + hidden_size: int, + quant_config: QuantizationConfig | None = None, + cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, + rms_norm_eps: float = 1e-5, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.hidden_size = hidden_size + self.model_config = model_config + self.cache_config = cache_config + if model_config is None: + raise ValueError("model_config must be provided") + kda_config = model_config.linear_attn_config + self.head_dim = kda_config["head_dim"] + self.num_heads = kda_config["num_heads"] + self.layer_idx = layer_idx + self.prefix = prefix + assert self.num_heads % self.tp_size == 0 + self.local_num_heads = divide(self.num_heads, self.tp_size) + + projection_size = self.head_dim * self.num_heads + self.conv_size = kda_config["short_conv_kernel_size"] + + self.q_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.k_proj", + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.v_proj", + ) + + self.f_a_proj = ReplicatedLinear( + self.hidden_size, + self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.f_a_proj", + ) + + self.f_b_proj = ColumnParallelLinear( + self.head_dim, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.f_b_proj", + ) + self.dt_bias = nn.Parameter( + torch.empty(divide(projection_size, self.tp_size), dtype=torch.float32) + ) + + set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) + + self.b_proj = ColumnParallelLinear( + self.hidden_size, + self.num_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.b_proj", + ) + + self.q_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + 
bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.q_conv1d", + ) + self.k_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.k_conv1d", + ) + self.v_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.v_conv1d", + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.q_conv1d.weight.data = self.q_conv1d.weight.data.unsqueeze(1) + self.k_conv1d.weight.data = self.k_conv1d.weight.data.unsqueeze(1) + self.v_conv1d.weight.data = self.v_conv1d.weight.data.unsqueeze(1) + + self.A_log = nn.Parameter( + torch.empty(1, 1, self.local_num_heads, 1, dtype=torch.float32) + ) + set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(2)}) + + self.g_a_proj = ReplicatedLinear( + self.hidden_size, + self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.g_a_proj", + ) + self.g_b_proj = ColumnParallelLinear( + self.head_dim, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.g_b_proj", + ) + self.o_norm = FusedRMSNormGated( + self.head_dim, eps=rms_norm_eps, activation="sigmoid" + ) + self.o_proj = RowParallelLinear( + projection_size, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + def forward( + self, + hidden_states: torch.Tensor, + positions: torch.Tensor, + output: torch.Tensor, + ) -> None: + return torch.ops.vllm.kda_attention( + hidden_states, + output, + self.prefix, + ) + + def _forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + ) -> None: + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = forward_context.attn_metadata + + if attn_metadata is None: + # V1 profile run + # Mimic the memory allocation in the real run + q = torch.empty_like(hidden_states) + k = torch.empty_like(hidden_states) + v = torch.empty_like(hidden_states) + g = hidden_states.new_empty( + hidden_states.size(0), + self.local_num_heads, + self.head_dim, + dtype=torch.float32, + ) + beta = torch.empty( + hidden_states.size(0), self.local_num_heads, dtype=torch.float32 + ) + core_attn_out = torch.empty_like(hidden_states) + return + + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, GDNAttentionMetadata) + has_initial_state = attn_metadata.has_initial_state + non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc + non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 + constant_caches = self.kv_cache[forward_context.virtual_engine] + + (conv_state_q, conv_state_k, conv_state_v, recurrent_state) = constant_caches + # deal with strides + conv_state_q = conv_state_q.transpose(-1, -2) + conv_state_k = conv_state_k.transpose(-1, -2) + conv_state_v = conv_state_v.transpose(-1, -2) + + q_proj_states = self.q_proj(hidden_states)[0] + k_proj_states = self.k_proj(hidden_states)[0] + v_proj_states = self.v_proj(hidden_states)[0] + + 
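+ # Collapse the conv1d weights from (out_channels, 1, kernel_size) back to 2D for the causal_conv1d calls below.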
q_conv_weights = self.q_conv1d.weight.view( + self.q_conv1d.weight.size(0), self.q_conv1d.weight.size(2) + ) + k_conv_weights = self.k_conv1d.weight.view( + self.k_conv1d.weight.size(0), self.k_conv1d.weight.size(2) + ) + v_conv_weights = self.v_conv1d.weight.view( + self.v_conv1d.weight.size(0), self.v_conv1d.weight.size(2) + ) + if attn_metadata.num_prefills > 0: + q_proj_states = q_proj_states.transpose(0, 1) + k_proj_states = k_proj_states.transpose(0, 1) + v_proj_states = v_proj_states.transpose(0, 1) + q = causal_conv1d_fn( + q_proj_states, + q_conv_weights, + self.q_conv1d.bias, + activation="silu", + conv_states=conv_state_q, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + k = causal_conv1d_fn( + k_proj_states, + k_conv_weights, + self.k_conv1d.bias, + activation="silu", + conv_states=conv_state_k, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + v = causal_conv1d_fn( + v_proj_states, + v_conv_weights, + self.v_conv1d.bias, + activation="silu", + conv_states=conv_state_v, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + else: + decode_conv_indices = non_spec_state_indices_tensor[ + : attn_metadata.num_decodes + ] + q = causal_conv1d_update( + q_proj_states, + conv_state_q, + q_conv_weights, + self.q_conv1d.bias, + activation="silu", + conv_state_indices=decode_conv_indices, + validate_data=True, + ) + k = causal_conv1d_update( + k_proj_states, + conv_state_k, + k_conv_weights, + self.k_conv1d.bias, + activation="silu", + conv_state_indices=decode_conv_indices, + validate_data=True, + ) + v = causal_conv1d_update( + v_proj_states, + conv_state_v, + v_conv_weights, + self.v_conv1d.bias, + activation="silu", + conv_state_indices=decode_conv_indices, + validate_data=True, + ) + + q, k, v = map( + lambda x: rearrange(x, "n (h d) -> 1 n h d", d=self.head_dim), (q, k, v) + ) + + beta = self.b_proj(hidden_states)[0].float().sigmoid() + + g = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0] + g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias) + + beta = beta.unsqueeze(0) + g = g.unsqueeze(0) + + if attn_metadata.num_prefills > 0: + zero_idx = non_spec_state_indices_tensor[~has_initial_state] + recurrent_state[zero_idx] = 0 + initial_state = recurrent_state[non_spec_state_indices_tensor].contiguous() + ( + core_attn_out_non_spec, + last_recurrent_state, + ) = chunk_kda( + q=q, + k=k, + v=v, + g=g, + beta=beta, + initial_state=initial_state, + output_final_state=True, + use_qk_l2norm_in_kernel=True, + cu_seqlens=non_spec_query_start_loc, + ) + # Init cache + recurrent_state[non_spec_state_indices_tensor] = last_recurrent_state + else: + ( + core_attn_out_non_spec, + last_recurrent_state, + ) = fused_recurrent_kda( + q=q, + k=k, + v=v, + g=g, + beta=beta, + initial_state=recurrent_state, + use_qk_l2norm_in_kernel=True, + cu_seqlens=non_spec_query_start_loc, + ssm_state_indices=non_spec_state_indices_tensor, + ) + + g_proj_states = self.g_b_proj(self.g_a_proj(hidden_states)[0])[0] + g = rearrange(g_proj_states, "... (h d) -> ... 
h d", d=self.head_dim) + core_attn_out = self.o_norm(core_attn_out_non_spec, g) + core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)") + + output[:] = self.o_proj(core_attn_out)[0] diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index fd4567ee47..0a2742ff49 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -77,7 +77,7 @@ class MiniMaxText01RMSNormTP(CustomOp): if self.tp_world > 1: variance = tensor_model_parallel_all_reduce(variance) / self.tp_world x = x * torch.rsqrt(variance + self.variance_epsilon) - x = x.to(orig_dtype) * self.weight + x = (x * self.weight).to(orig_dtype) return x def forward( diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 91a4562358..831dab2fbb 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -80,6 +80,15 @@ class MambaStateDtypeCalculator: state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) return (state_dtype, state_dtype) + @classmethod + def kda_state_dtype( + cls, + model_dtype: ModelDType | torch.dtype, + mamba_cache_dtype: MambaDType, + ): + state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) + return (state_dtype, state_dtype, state_dtype, torch.float32) + class MambaStateShapeCalculator: @classmethod @@ -182,3 +191,35 @@ class MambaStateShapeCalculator: head_v_dim, ) return conv_state_shape, temporal_state_shape + + @classmethod + def kda_state_shape( + cls, + tp_world_size: int, + num_heads: int, + head_dim: int, + num_k_heads: int | None = None, + head_k_dim: int | None = None, + conv_kernel_size: int = 4, + num_spec: int = 0, + ) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int, int]]: + if num_k_heads is None: + num_k_heads = num_heads + if head_k_dim is None: + head_k_dim = head_dim + + proj_size = num_heads * head_dim + proj_k_size = num_k_heads * head_k_dim + + conv_state_shape = (divide(proj_size, tp_world_size), conv_kernel_size - 1) + conv_state_k_shape = (divide(proj_k_size, tp_world_size), conv_kernel_size - 1) + recurrent_state_shape = (divide(num_heads, tp_world_size), head_dim, head_dim) + + conv_state_shape = conv_state_shape[1], conv_state_shape[0] + conv_state_k_shape = conv_state_k_shape[1], conv_state_k_shape[0] + return ( + conv_state_shape, + conv_state_k_shape, + conv_state_k_shape, + recurrent_state_shape, + ) diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index 34f05f2ee9..c4c44b83ae 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -147,9 +147,10 @@ class MultiHeadLatentAttentionWrapper(CustomOp): # Add head dim of 1 to k_pe k_pe = k_pe.unsqueeze(1) - q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb( - positions, q[..., self.qk_nope_head_dim :], k_pe - ) + if self.rotary_emb is not None: + q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb( + positions, q[..., self.qk_nope_head_dim :], k_pe + ) if self.indexer and self.is_sparse: _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 145f18f235..7dd02e32ff 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -607,7 +607,7 @@ class ClassifierPooler(Pooler): pooled_data -= self.logit_bias pooling_params 
= get_pooling_params(pooling_metadata) - flags = [p.activation for p in pooling_params] + flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: scores = self.act_fn(pooled_data) if flags[0] else pooled_data @@ -681,7 +681,7 @@ class TokenClassifierPoolerHead(nn.Module): if self.logit_bias is not None: scores -= self.logit_bias - if pooling_param.activation: + if pooling_param.use_activation: scores = self.act_fn(scores) # scores shape: [n_token, num_labels] diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 711902f0cc..91276320df 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -14,7 +14,7 @@ from .rocm_aiter_rope_ops import ( @CustomOp.register("rotary_embedding") -class RotaryEmbedding(CustomOp): +class RotaryEmbeddingBase(CustomOp): """Original rotary positional embedding.""" def __init__( @@ -86,6 +86,21 @@ class RotaryEmbedding(CustomOp): ): self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) + +class RotaryEmbedding(RotaryEmbeddingBase): + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + def forward_native( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index 2e5efec066..d9134f05fd 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -7,7 +7,7 @@ import torch from vllm.platforms import current_platform -from .base import RotaryEmbedding +from .base import RotaryEmbeddingBase from .common import ( rotate_gptj, rotate_neox, @@ -22,7 +22,7 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: return 0.1 * mscale * math.log(scale) + 1.0 -class DeepseekScalingRotaryEmbedding(RotaryEmbedding): +class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase): """RotaryEmbedding extended with YaRN method. Credits to Peng et al. 
github.com/jquesnelle/yarn diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py index 6241cb5abb..9fdac309df 100644 --- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -5,10 +5,10 @@ import math import torch -from .base import RotaryEmbedding +from .base import RotaryEmbeddingBase -class Llama4VisionRotaryEmbedding(RotaryEmbedding): +class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase): def __init__( self, head_size: int, @@ -78,10 +78,3 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): key: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None]: return self.forward_native(query, key) - - def forward_hip( # type: ignore[override] - self, - query: torch.Tensor, - key: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(query, key) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index d269733083..3c184ce9d6 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -7,7 +7,7 @@ import torch from vllm.triton_utils import tl, triton -from .base import RotaryEmbedding +from .base import RotaryEmbeddingBase from .common import apply_rotary_emb_dispatch from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale @@ -199,7 +199,7 @@ def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.T return x_t -class MRotaryEmbedding(RotaryEmbedding): +class MRotaryEmbedding(RotaryEmbeddingBase): """Rotary Embedding with Multimodal Sections.""" def __init__( @@ -357,24 +357,6 @@ class MRotaryEmbedding(RotaryEmbedding): key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key - def forward_xpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor | None = None, - offsets: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(positions, query, key, offsets) - - def forward_cpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor | None = None, - offsets: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(positions, query, key, offsets) - @staticmethod def get_next_input_positions( mrope_position_delta: int, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index da5eea02d1..925f9ac0a1 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -119,17 +119,17 @@ def rocm_unquantized_gemm_impl( if use_skinny is not True: return torch.nn.functional.linear(x, weight, bias) - x_view = x.view(-1, x.size(-1)) + x_view = x.reshape(-1, x.size(-1)) n = x_view.shape[0] m = weight.shape[0] cu_count = current_platform.get_cu_count() if m > 8 and 0 < n <= 4: out = ops.wvSplitK(weight, x_view, cu_count, bias) - return out.view(*x.shape[:-1], weight.shape[0]) + return out.reshape(*x.shape[:-1], weight.shape[0]) elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None: out = ops.LLMM1(weight, x_view, 4) - return out.view(*x.shape[:-1], weight.shape[0]) + return out.reshape(*x.shape[:-1], weight.shape[0]) return torch.nn.functional.linear(x, weight, bias) diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py 
b/vllm/model_executor/model_loader/runai_streamer_loader.py index 079e316864..93da07c550 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -27,9 +27,16 @@ class RunaiModelStreamerLoader(BaseModelLoader): def __init__(self, load_config: LoadConfig): super().__init__(load_config) + + self._is_distributed = False if load_config.model_loader_extra_config: extra_config = load_config.model_loader_extra_config + if "distributed" in extra_config and isinstance( + extra_config.get("distributed"), bool + ): + self._is_distributed = extra_config.get("distributed") + if "concurrency" in extra_config and isinstance( extra_config.get("concurrency"), int ): @@ -92,8 +99,7 @@ class RunaiModelStreamerLoader(BaseModelLoader): """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) return runai_safetensors_weights_iterator( - hf_weights_files, - self.load_config.use_tqdm_on_load, + hf_weights_files, self.load_config.use_tqdm_on_load, self._is_distributed ) def download_model(self, model_config: ModelConfig) -> None: diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 5a9faefa4d..3dbe803f99 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -657,10 +657,22 @@ def multi_thread_safetensors_weights_iterator( def runai_safetensors_weights_iterator( hf_weights_files: list[str], use_tqdm_on_load: bool, + is_distributed: bool = False, ) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" with SafetensorsStreamer() as streamer: - streamer.stream_files(hf_weights_files) + is_cuda_alike = current_platform.is_cuda_alike() + device = ( + f"cuda:{current_platform.current_device()}" + if is_distributed and is_cuda_alike + else "cpu" + ) + + streamer.stream_files( + hf_weights_files, + device=device, + is_distributed=is_distributed, + ) total_tensors = sum( len(tensors_meta) for tensors_meta in streamer.files_to_tensors_metadata.values() @@ -672,6 +684,7 @@ def runai_safetensors_weights_iterator( desc="Loading safetensors using Runai Model Streamer", bar_format=_BAR_FORMAT, disable=not enable_tqdm(use_tqdm_on_load), + mininterval=2, ) yield from tensor_iter diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index ac5949cda9..b0a48a9f1d 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy +from math import lcm from typing import TYPE_CHECKING import vllm.envs as envs @@ -8,7 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec +from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec if TYPE_CHECKING: from vllm.config import VllmConfig @@ -53,8 +54,8 @@ class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: pooler_config = vllm_config.model_config.pooler_config - if pooler_config.activation is None: - 
pooler_config.activation = False + if pooler_config.use_activation is None: + pooler_config.use_activation = False class JinaRobertaModelConfig(VerifyAndUpdateConfig): @@ -347,12 +348,28 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] # get attention page size (for 1 token) - attn_page_size_1_token = FullAttentionSpec( - block_size=1, - num_kv_heads=model_config.get_num_kv_heads(parallel_config), - head_size=model_config.get_head_size(), - dtype=kv_cache_dtype, - ).page_size_bytes + # Attention backend constraints: + # - FlashAttention (FA) requires block size to be multiple of 16 + # - MLA (Multi-head Latent Attention) requires larger alignment: + # * CUTLASS_MLA backend: kernel_block_size 128 alignment + # * Other MLA backends: kernel_block_size 64 alignment + if model_config.use_mla: + use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" + kernel_block_alignment_size = 128 if use_cutlass_mla else 64 + attn_page_size_1_token = MLAAttentionSpec( + block_size=1, + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + dtype=kv_cache_dtype, + ).page_size_bytes + else: + kernel_block_alignment_size = 16 + attn_page_size_1_token = FullAttentionSpec( + block_size=1, + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + dtype=kv_cache_dtype, + ).page_size_bytes model_cls, _ = ModelRegistry.resolve_model_cls( model_config.architecture, @@ -372,17 +389,6 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): if mamba_page_size == 0: return - # Attention backend constraints: - # - FlashAttention (FA) requires block size to be multiple of 16 - # - MLA (Multi-head Latent Attention) requires larger alignment: - # * CUTLASS_MLA backend: 128-byte alignment - # * Other MLA backends: 64-byte alignment - if model_config.use_mla: - use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" - kernel_block_alignment_size = 128 if use_cutlass_mla else 64 - else: - kernel_block_alignment_size = 16 - if cache_config.enable_prefix_caching: # With prefix caching, select attention block size to # optimize for mamba kernel performance @@ -400,15 +406,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # easily by changing the way we layout chunks in the # mamba2 kernels. 
- from math import gcd - - def lcm(a, b): - return a * b // gcd(a, b) - - base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size() - + base_chunk_size = model_config.get_mamba_chunk_size() attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token) - chunk_size = lcm(base_chunk_size, kernel_block_alignment_size) attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size) cache_config.mamba_block_size = attn_block_size diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py new file mode 100644 index 0000000000..a60a8d764d --- /dev/null +++ b/vllm/model_executor/models/kimi_linear.py @@ -0,0 +1,663 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.distributed import ( + get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.kda import KimiDeltaAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig + +from .interfaces import HasInnerState, IsHybrid, MixtureOfExperts, SupportsPP +from .utils import ( + PPMissingLayer, + is_pp_missing_parameter, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class KimiMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QKVParallelLinear | None = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__() + + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. Only silu is supported for now." 
+ ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class KimiMoE(nn.Module): + def __init__( + self, + config: KimiLinearConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + layer_idx: int = 0, + ): + super().__init__() + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + moe_intermediate_size = config.moe_intermediate_size + num_experts = config.num_experts + moe_renormalize = config.moe_renormalize + self.tp_size = get_tensor_model_parallel_world_size() + self.routed_scaling_factor = config.routed_scaling_factor + self.num_shared_experts = config.num_shared_experts + self.layer_idx = layer_idx + + if config.hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now." + ) + + # Gate always runs at half / full precision for now. + self.gate = ReplicatedLinear( + hidden_size, + num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + self.gate.e_score_correction_bias = nn.Parameter(torch.empty(num_experts)) + + self.experts = FusedMoE( + num_experts=num_experts, + top_k=config.num_experts_per_token, + hidden_size=hidden_size, + intermediate_size=moe_intermediate_size, + reduce_results=False, + renormalize=moe_renormalize, + quant_config=quant_config, + use_grouped_topk=config.use_grouped_topk, + num_expert_group=config.num_expert_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.moe_router_activation_func, + e_score_correction_bias=self.gate.e_score_correction_bias, + ) + + if self.num_shared_experts is not None: + intermediate_size = moe_intermediate_size * self.num_shared_experts + self.shared_experts = KimiMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_size = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_size) + if self.num_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = ( + self.experts(hidden_states=hidden_states, router_logits=router_logits) + * self.routed_scaling_factor + ) + if shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) + return final_hidden_states.view(num_tokens, hidden_size) + + +class KimiMLAAttention(nn.Module): + """ + Main reference: DeepseekV2 vllm Implementation + """ + + def __init__( + self, + config: KimiLinearConfig, + hidden_size: int, + num_heads: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int | None, + kv_lora_rank: int, + rope_theta: float = 10000, + use_nope: bool = False, + rope_scaling: dict[str, Any] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + 
self.num_heads = num_heads + tp_size = get_tensor_model_parallel_world_size() + self.num_local_heads = num_heads // tp_size + self.scaling = self.qk_head_dim**-0.5 + self.rope_theta = rope_theta + self.use_nope = use_nope + assert self.use_nope is True + assert self.q_lora_rank is None + assert rope_scaling is None + assert num_heads % tp_size == 0 + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_a_proj_with_mqa", + ) + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.num_heads * self.qk_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.kv_a_layernorm = RMSNorm( + self.kv_lora_rank, + eps=config.rms_norm_eps, + ) + self.kv_b_proj = ColumnParallelLinear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_b_proj", + ) + self.o_proj = RowParallelLinear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + mla_modules = MLAModules( + kv_a_layernorm=self.kv_a_layernorm, + kv_b_proj=self.kv_b_proj, + rotary_emb=None, + o_proj=self.o_proj, + fused_qkv_a_proj=None, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, + q_a_layernorm=None, + q_b_proj=None, + q_proj=self.q_proj, + indexer=None, + is_sparse=False, + topk_indices_buffer=None, + ) + self.mla_attn = MultiHeadLatentAttentionWrapper( + self.hidden_size, + self.num_local_heads, + self.scaling, + self.qk_nope_head_dim, + self.qk_rope_head_dim, + self.v_head_dim, + self.q_lora_rank, + self.kv_lora_rank, + mla_modules, + cache_config, + quant_config, + prefix, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + output: torch.Tensor, + ) -> None: + output[:] = self.mla_attn(positions, hidden_states) + + +class KimiDecoderLayer(nn.Module): + def __init__( + self, + config: KimiLinearConfig, + layer_idx: int, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + parallel_config: ParallelConfig | None = None, + model_config: ModelConfig | None = None, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + + self.is_moe = config.is_moe + + if config.is_kda_layer(layer_idx): + self.self_attn = KimiDeltaAttention( + layer_idx=layer_idx, + hidden_size=config.hidden_size, + quant_config=quant_config, + cache_config=cache_config, + model_config=config, + prefix=f"{prefix}.self_attn", + ) + else: + self.self_attn = KimiMLAAttention( + layer_idx=layer_idx, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + quant_config=quant_config, + cache_config=cache_config, + model_config=model_config, + prefix=f"{prefix}.self_attn", + config=config, + qk_nope_head_dim=config.qk_nope_head_dim, + qk_rope_head_dim=config.qk_rope_head_dim, + v_head_dim=config.v_head_dim, + q_lora_rank=config.q_lora_rank, + kv_lora_rank=config.kv_lora_rank, + use_nope=config.mla_use_nope, + ) + + if ( + self.is_moe + and config.num_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0 + ): + self.block_sparse_moe = KimiMoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.mlp = self.block_sparse_moe + else: + self.mlp = KimiMLP( + hidden_size=self.hidden_size, + 
intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + attn_output = torch.empty_like(hidden_states) + self.self_attn( + hidden_states=hidden_states, + positions=positions, + output=attn_output, + ) + hidden_states = attn_output + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile +class KimiLinearModel(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + parallel_config = vllm_config.parallel_config + self.config = config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + extra_kwargs = {} + + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + return KimiDecoderLayer( + config, + layer_idx, + cache_config, + quant_config, + parallel_config, + model_config, + prefix, + **extra_kwargs, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + get_layer, + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + world_size = get_tensor_model_parallel_world_size() + assert config.num_attention_heads % world_size == 0, ( + "num_attention_heads must be divisible by world_size" + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for _, layer in enumerate(self.layers[self.start_layer : self.end_layer]): + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class KimiLinearForCausalLM( + nn.Module, 
HasInnerState, SupportsPP, MixtureOfExperts, IsHybrid +): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model_config = vllm_config.model_config + self.vllm_config = vllm_config + self.config = self.model_config.hf_config + quant_config = vllm_config.quant_config + self.quant_config = quant_config + self.model = KimiLinearModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor( + self.config.vocab_size, scale=logit_scale + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs + ) + return hidden_states + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.kda_state_dtype( + vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype + ) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + tp_size = parallel_config.tensor_parallel_size + num_spec = ( + vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config + else 0 + ) + return MambaStateShapeCalculator.kda_state_shape( + tp_size, + hf_config.linear_attn_config["num_heads"], + hf_config.linear_attn_config["head_dim"], + conv_kernel_size=hf_config.linear_attn_config["short_conv_kernel_size"], + num_spec=num_spec, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.logits_processor(self.lm_head, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + if self.config.is_moe: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_experts, + ) + else: + expert_params_mapping = [] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for args in weights: + name, loaded_weight = args[:2] + kwargs = args[2] if len(args) > 2 else {} + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: + # Models trained using ColossalAI may include these 
tensors in + # the checkpoint. Skip them. + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for idx, (param_name, weight_name, expert_id, shard_id) in enumerate( + expert_params_mapping + ): + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + expert_id=expert_id, + shard_id=shard_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if ( + name.endswith(".bias") + and name not in params_dict + and not self.config.is_linear_attn + ): # noqa: E501 + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight, **kwargs) + loaded_params.add(name) + + +def get_spec_layer_idx_from_weight_name( + config: KimiLinearConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_nextn_predict_layers") and ( + config.num_nextn_predict_layers > 0 + ): + layer_idx = config.num_hidden_layers + for i in range(config.num_nextn_predict_layers): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index dadb8a19c0..21ed428a05 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -263,23 +263,6 @@ class MiniMaxM2DecoderLayer(nn.Module): # with the layer's index. 
layer_idx = int(prefix.split(sep=".")[-1]) - # TODO: support MTP - attn_window_size = getattr(config, "attn_window_size", None) - if attn_window_size is not None: - if isinstance(attn_window_size, list): - attn_window_size = attn_window_size[layer_idx] - elif isinstance(attn_window_size, int): - attn_window_size = attn_window_size - else: - raise ValueError(f"Invalid attn_window_size: {attn_window_size}") - attn_window_size = None if attn_window_size <= 0 else attn_window_size - - # different rope theta for full layer and swa layer - swa_rope_theta = getattr(config, "swa_rope_theta", -1) - # default to full rope theta - swa_rope_theta = rope_theta if swa_rope_theta <= 0 else swa_rope_theta - rope_theta = swa_rope_theta if attn_window_size is not None else rope_theta - self.layer_idx = layer_idx self.self_attn = MiniMaxM2Attention( hidden_size=self.hidden_size, @@ -288,7 +271,6 @@ class MiniMaxM2DecoderLayer(nn.Module): rotary_dim=config.rotary_dim, rope_theta=rope_theta, rope_scaling=rope_scaling, - attn_window_size=attn_window_size, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py new file mode 100644 index 0000000000..b8dad909c5 --- /dev/null +++ b/vllm/model_executor/models/ouro.py @@ -0,0 +1,518 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Ouro model compatible with HuggingFace weights.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA +from .utils import ( + AutoWeightsLoader, + extract_layer_index, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + + +class OuroMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. Only silu is supported for now." + ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class OuroAttention(nn.Module): + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + rope_scaling: tuple | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + dual_chunk_attention_config: dict[str, Any] | None = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.dual_chunk_attention_config = dual_chunk_attention_config + + # Get total_ut_steps from config, default to 4 if not specified + total_ut_steps = getattr(config, "total_ut_steps", 4) + + # Use total number of hidden layers instead of hardcoded 24 + total_layers = config.num_hidden_layers + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = nn.ModuleList() + for ut_step in range(total_ut_steps): + base_layer_idx = extract_layer_index(prefix) + unique_layer_idx = ut_step * total_layers + base_layer_idx + + unique_prefix = prefix.replace( + f"layers.{base_layer_idx}", f"layers.{unique_layer_idx}" + ) + + self.attn.append( + Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + attn_type=attn_type, + prefix=f"{unique_prefix}.attn", + **{ + "layer_idx": unique_layer_idx, + "dual_chunk_attention_config": dual_chunk_attention_config, + } + if dual_chunk_attention_config + else {}, + ) + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + current_ut: int, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn[current_ut](q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class OuroDecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 1000000) + rope_scaling = getattr(config, "rope_scaling", None) + dual_chunk_attention_config = getattr( + config, "dual_chunk_attention_config", None + ) + + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + + self.self_attn = OuroAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + cache_config=cache_config, + quant_config=quant_config, + rope_scaling=rope_scaling, + prefix=f"{prefix}.self_attn", + attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.mlp = OuroMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + 
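# --- Editorial sketch (not part of the patch): how OuroAttention above gives
# every (ut_step, layer) pair its own Attention module and KV-cache slot during
# the looped "universal transformer" forward. Only the index arithmetic mirrors
# OuroAttention.__init__; the helper and the 24-layer example are hypothetical.


def unique_attn_index(ut_step: int, base_layer_idx: int, total_layers: int) -> int:
    # Step-major layout: all layers of UT step 0 come first, then step 1, etc.
    # In the patch, the module prefix is rewritten from f"layers.{base_layer_idx}"
    # to f"layers.{unique_layer_idx}" so each Attention registers a distinct cache.
    return ut_step * total_layers + base_layer_idx


# With the total_ut_steps=4 default used above and a 24-layer model, decoder
# layer 3 is served by four separate Attention modules across the passes:
assert [unique_attn_index(step, 3, 24) for step in range(4)] == [3, 27, 51, 75]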
self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm_2 = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + current_ut: int, + residual: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, hidden_states=hidden_states, current_ut=current_ut + ) + hidden_states = self.input_layernorm_2(hidden_states) + + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_attention_layernorm_2(hidden_states) + + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class OuroModel(nn.Module): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + decoder_layer_type: type[nn.Module] = OuroDecoderLayer, + ): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + # TODO (@robertgshaw2): see if this can be moved out + if cache_config.sliding_window is not None and hasattr( + config, "max_window_layers" + ): + assert config.max_window_layers == config.num_hidden_layers, ( + "Sliding window for some but all layers is not supported. " + "This model uses sliding window but `max_window_layers` = {} " + "is less than `num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + ) + ) + + self.config = config + self.quant_config = quant_config + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", + ) + + # Use the provided decoder layer type or default to OuroDecoderLayer + decoder_layer_type = decoder_layer_type or OuroDecoderLayer + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: decoder_layer_type( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.early_exit_gate = RowParallelLinear(config.hidden_size, 1, bias=True) + + self.total_ut_steps = getattr(self.config, "total_ut_steps", 4) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + + for current_ut in range(self.total_ut_steps): + residual = None + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer( + positions, hidden_states, current_ut, residual + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class OuroForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = OuroModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 6338ea93b8..677d34dea3 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -45,6 +45,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2_5_vl import ( @@ -759,7 +760,8 @@ class Qwen2_5OmniConditionalGenerationMixin: assert grid_thw.ndim == 2 pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + with set_forward_context(None, self.vllm_config): + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) # Split concatenated embeddings for each image item. 
merge_size = self.visual.spatial_merge_size sizes = grid_thw.prod(-1) // merge_size // merge_size @@ -779,7 +781,8 @@ class Qwen2_5OmniConditionalGenerationMixin: assert grid_thw.ndim == 2 pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + with set_forward_context(None, self.vllm_config): + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size sizes = grid_thw.prod(-1) // merge_size // merge_size @@ -839,6 +842,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + self.vllm_config = vllm_config thinker_config: Qwen2_5OmniThinkerConfig = ( vllm_config.model_config.hf_config.thinker_config ) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b622021e22..3d67653726 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -31,10 +31,10 @@ from collections.abc import Callable, Iterable, Mapping, Sequence from functools import lru_cache, partial from typing import Annotated, Any, Literal, TypeAlias +import einops import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange from transformers import BatchFeature, PretrainedConfig from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( @@ -43,13 +43,16 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( ) from vllm.attention.backends.registry import _Backend -from vllm.attention.layer import ( - check_upstream_fa_availability, - maybe_get_vit_flash_attn_backend, +from vllm.attention.layer import maybe_get_vit_flash_attn_backend +from vllm.attention.ops.vit_attn_wrappers import ( + vit_flash_attn_wrapper, + vit_xformers_attn_wrapper, ) +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -312,6 +315,7 @@ class Qwen2_5_VisionAttention(nn.Module): use_data_parallel: bool = False, attn_backend: _Backend = _Backend.TORCH_SDPA, use_upstream_fa: bool = False, + attn_backend_override: _Backend | None = None, ) -> None: super().__init__() # Per attention head and per partition values. 
@@ -352,8 +356,14 @@ class Qwen2_5_VisionAttention(nn.Module): maybe_get_vit_flash_attn_backend( self.attn_backend, self.use_upstream_fa, + attn_backend_override=attn_backend_override, ) ) + # On ROCm with FLASH_ATTN backend, upstream flash_attn is used + from vllm.platforms import current_platform + + if current_platform.is_rocm() and self.attn_backend == _Backend.FLASH_ATTN: + self.use_upstream_fa = True self.is_flash_attn_backend = self.attn_backend in { _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA, @@ -392,8 +402,8 @@ class Qwen2_5_VisionAttention(nn.Module): x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, - max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + max_seqlen: torch.Tensor, # Only used for Flash Attention + seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -402,7 +412,7 @@ class Qwen2_5_VisionAttention(nn.Module): q, k, v = self.split_qkv(x) batch_size = q.shape[1] - q, k, v = (rearrange(x, "s b ... -> b s ...") for x in (q, k, v)) + q, k, v = (einops.rearrange(x, "s b ... -> b s ...") for x in (q, k, v)) if rotary_pos_emb is not None: # [2 * b, s, heads, head_dim] qk_concat = torch.cat([q, k], dim=0) @@ -410,27 +420,22 @@ class Qwen2_5_VisionAttention(nn.Module): q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: - q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) - - output = self.flash_attn_varlen_func( + context_layer = vit_flash_attn_wrapper( q, k, v, - cu_seqlens_q=cu_seqlens, - cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen, - max_seqlen_k=max_seqlen, - dropout_p=0.0, - causal=False, + cu_seqlens, + max_seqlen, + batch_size, + self.attn_backend == _Backend.ROCM_AITER_FA, + self.use_upstream_fa, ) - - context_layer = rearrange( - output, "(b s) h d -> s b (h d)", b=batch_size - ).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. 
from vllm.platforms import current_platform + # Never remove the next contiguous logic + # Without it, hallucinations occur with the backend if current_platform.is_rocm(): q = q.contiguous() k = k.contiguous() @@ -443,34 +448,33 @@ class Qwen2_5_VisionAttention(nn.Module): k_i = k[:, start_idx:end_idx] v_i = v[:, start_idx:end_idx] q_i, k_i, v_i = ( - rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] + einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] ) output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) - output_i = rearrange(output_i, "b h s d -> b s h d ") + output_i = einops.rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) - context_layer = rearrange( + context_layer = einops.rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None - ) - context_layer = rearrange( - context_layer, "b s h d -> s b (h d)" - ).contiguous() + context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) output, _ = self.proj(context_layer) return output +# (FIXME): Enable this after dynamic slicing is fixed +# See https://github.com/vllm-project/vllm/pull/27760 +# @support_torch_compile( +# dynamic_arg_dims={ +# "x": 0, +# "cu_seqlens": 0, +# "rotary_pos_emb": 0, +# "seqlens": 0, +# }, +# mark_unbacked_dims={"seqlens": 0}, +# ) class Qwen2_5_VisionBlock(nn.Module): def __init__( self, @@ -484,6 +488,7 @@ class Qwen2_5_VisionBlock(nn.Module): use_data_parallel: bool = False, attn_backend: _Backend = _Backend.TORCH_SDPA, use_upstream_fa: bool = False, + attn_backend_override: _Backend | None = None, ) -> None: super().__init__() if norm_layer is None: @@ -499,6 +504,7 @@ class Qwen2_5_VisionBlock(nn.Module): use_data_parallel=use_data_parallel, attn_backend=attn_backend, use_upstream_fa=use_upstream_fa, + attn_backend_override=attn_backend_override, ) self.mlp = Qwen2_5_VisionMLP( dim, @@ -515,8 +521,8 @@ class Qwen2_5_VisionBlock(nn.Module): x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, - max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + max_seqlen: torch.Tensor, # Only used for Flash Attention + seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), @@ -530,6 +536,11 @@ class Qwen2_5_VisionBlock(nn.Module): return x +@support_torch_compile( + dynamic_arg_dims={ + "x": 0, + } +) class Qwen2_5_VisionPatchEmbed(nn.Module): def __init__( self, @@ -556,6 +567,11 @@ class Qwen2_5_VisionPatchEmbed(nn.Module): return x +@support_torch_compile( + dynamic_arg_dims={ + "x": 0, + } +) class Qwen2_5_VisionPatchMerger(nn.Module): def __init__( self, @@ -665,13 +681,18 @@ class Qwen2_5_VisionTransformer(nn.Module): self.spatial_merge_size = vision_config.spatial_merge_size self.fullatt_block_indexes = vision_config.fullatt_block_indexes self.spatial_merge_unit = self.spatial_merge_size**2 + # TODO[@lucaskabela]: Investigate fixing this usage + # see https://github.com/vllm-project/vllm/issues/27044 + # DO NOT MOVE THIS IMPORT + from vllm.compilation.backends import set_model_tag - self.patch_embed = 
Qwen2_5_VisionPatchEmbed( - patch_size=patch_size, - temporal_patch_size=temporal_patch_size, - in_channels=in_channels, - hidden_size=self.hidden_size, - ) + with set_model_tag("Qwen2_5_VisionPatchEmbed"): + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + hidden_size=self.hidden_size, + ) norm_layer = partial(RMSNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads @@ -683,13 +704,14 @@ class Qwen2_5_VisionTransformer(nn.Module): dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != _Backend.FLASH_ATTN - and self.attn_backend != _Backend.ROCM_AITER_FA - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = _Backend.FLASH_ATTN - use_upstream_fa = True + + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + use_upstream_fa, + attn_backend_override=attn_backend_override, + ) + ) if self.attn_backend not in { _Backend.FLASH_ATTN, @@ -701,32 +723,36 @@ class Qwen2_5_VisionTransformer(nn.Module): f"Qwen2.5-VL does not support {self.attn_backend} backend now." ) - self.blocks = nn.ModuleList( - [ - Qwen2_5_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=get_act_and_mul_fn(vision_config.hidden_act), - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}", - use_data_parallel=use_data_parallel, - attn_backend=self.attn_backend, - use_upstream_fa=use_upstream_fa, - ) - for layer_idx in range(depth) - ] - ) - self.merger = Qwen2_5_VisionPatchMerger( - d_model=vision_config.out_hidden_size, - context_dim=self.hidden_size, - norm_layer=norm_layer, - spatial_merge_size=self.spatial_merge_size, - quant_config=quant_config, - prefix=f"{prefix}.merger", - use_data_parallel=use_data_parallel, - ) + with set_model_tag("Qwen2_5_VisionBlock"): + self.blocks = nn.ModuleList( + [ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=get_act_and_mul_fn(vision_config.hidden_act), + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel, + attn_backend=self.attn_backend, + use_upstream_fa=use_upstream_fa, + attn_backend_override=attn_backend_override, + ) + for layer_idx in range(depth) + ] + ) + + with set_model_tag("Qwen2_5_VisionPatchMerger"): + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, + ) @property def dtype(self) -> torch.dtype: @@ -827,15 +853,16 @@ class Qwen2_5_VisionTransformer(nn.Module): def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor, - ) -> tuple[int | None, list[int] | None]: - max_seqlen, seqlens = None, None + ) -> tuple[torch.Tensor, torch.Tensor]: + max_seqlen = torch.zeros([], device=cu_seqlens.device) + seqlens = torch.zeros(1, device=cu_seqlens.device) if ( self.attn_backend == _Backend.FLASH_ATTN or self.attn_backend == _Backend.ROCM_AITER_FA ): - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() elif self.attn_backend == _Backend.XFORMERS: - seqlens = 
(cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] return max_seqlen, seqlens @staticmethod @@ -1233,6 +1260,7 @@ class Qwen2_5_VLForConditionalGeneration( self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config + self.vllm_config = vllm_config self.multimodal_config = multimodal_config self.video_pruning_rate = multimodal_config.video_pruning_rate self.is_multimodal_pruning_enabled = ( @@ -1248,7 +1276,7 @@ class Qwen2_5_VLForConditionalGeneration( else None ) self.visual = Qwen2_5_VisionTransformer( - config.vision_config, + vision_config=config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self.quant_config, prefix=maybe_prefix(prefix, "visual"), @@ -1336,13 +1364,13 @@ class Qwen2_5_VLForConditionalGeneration( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"] - - if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" - ) - else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" + ) + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync @@ -1396,12 +1424,18 @@ class Qwen2_5_VLForConditionalGeneration( video_embeds = video_input["video_embeds"].type(self.visual.dtype) else: pixel_values_videos = video_input["pixel_values_videos"] - if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d" - ) - else: - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list) + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d", + ) + else: + video_embeds = self.visual( + pixel_values_videos, grid_thw=grid_thw_list + ) # Split concatenated embeddings for each video item. 
merge_size = self.visual.spatial_merge_size diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index e81ad5f68d..f452ba8715 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -159,6 +159,7 @@ class Qwen3NextSparseMoeBlock(nn.Module): self.experts = SharedFusedMoE( shared_experts=self.shared_expert, + gate=self.gate, num_experts=self.n_routed_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, @@ -181,11 +182,17 @@ class Qwen3NextSparseMoeBlock(nn.Module): if self.is_sequence_parallel: hidden_states = sequence_parallel_chunk(hidden_states) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits - ) + if self.experts.is_internal_router: + # In this case, the gate/router runs inside the FusedMoE class + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) if self.shared_expert is not None: final_hidden_states = final_hidden_states[0] + final_hidden_states[1] diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index f3b6ad495d..efcd003fbb 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -223,8 +223,8 @@ class Qwen3_VisionBlock(nn.Module): x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, - max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + max_seqlen: torch.Tensor, # Only used for Flash Attention + seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -488,12 +488,13 @@ class Qwen3Omni_VisionTransformer(nn.Module): def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor, - ) -> tuple[int | None, list[int] | None]: - max_seqlen, seqlens = None, None + ) -> tuple[torch.Tensor, torch.Tensor]: + max_seqlen = torch.zeros([], device=cu_seqlens.device) + seqlens = torch.zeros(1, device=cu_seqlens.device) if self.attn_backend == _Backend.FLASH_ATTN: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] return max_seqlen, seqlens def forward( @@ -1114,6 +1115,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + self.vllm_config = vllm_config # needed for torch compile forward context thinker_config: Qwen3OmniMoeThinkerConfig = ( vllm_config.model_config.hf_config.thinker_config ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 10c0eb4eb6..d611580c71 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -231,8 +231,8 @@ class Qwen3_VisionBlock(nn.Module): x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, - max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # 
Only used for xFormers + max_seqlen: torch.Tensor, # Only used for Flash Attention + seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -512,15 +512,16 @@ class Qwen3_VisionTransformer(nn.Module): def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor, - ) -> tuple[int | None, list[int] | None]: - max_seqlen, seqlens = None, None + ) -> tuple[torch.Tensor, torch.Tensor]: + max_seqlen = torch.zeros([], device=cu_seqlens.device) + seqlens = torch.zeros(1, device=cu_seqlens.device) if ( self.attn_backend == _Backend.FLASH_ATTN or self.attn_backend == _Backend.ROCM_AITER_FA ): - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] return max_seqlen, seqlens def forward( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0027954ac2..7eca1a09e5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -118,6 +118,7 @@ _TEXT_GENERATION_MODELS = { "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), + "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"), # noqa: E501 "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"), "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), @@ -147,6 +148,7 @@ _TEXT_GENERATION_MODELS = { "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), + "OuroForCausalLM": ("ouro", "OuroForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 1ae2c7408a..2fa3f6ebcc 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -506,6 +506,11 @@ class MultiModalDataParser: for data_item in data_items: video, metadata = self._get_video_with_metadata(data_item) if self.video_needs_metadata: + if metadata is None: + raise ValueError( + "Video metadata is required but not found in mm input. " + "Please check your video input in `multi_modal_data`" + ) new_videos.append((video, metadata)) metadata_lst.append(metadata) else: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 66cffde950..cc06f034fb 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -261,6 +261,21 @@ class CudaPlatformBase(Platform): from vllm.attention.backends.registry import _Backend if use_mla: + # explicitly reject non-MLA backends when MLA is enabled to avoid + # silently selecting an incompatible backend (e.g., FLASHINFER). + if selected_backend in { + _Backend.FLASHINFER, + _Backend.FLASH_ATTN, + _Backend.TRITON_ATTN, + _Backend.TREE_ATTN, + _Backend.XFORMERS, + }: + raise ValueError( + f"Attention backend {selected_backend} incompatible with MLA. " + "Please use one of the MLA backends: FLASHINFER_MLA, CUTLASS_MLA, " + "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set " + "VLLM_MLA_DISABLE=1 to disable MLA for this model." + ) if not use_v1: raise RuntimeError( "MLA attention backends require the V1 engine. 
" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 059ed4430e..d3535c9781 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -72,6 +72,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = { "0x74a0": "AMD_Instinct_MI300A", "0x74a1": "AMD_Instinct_MI300X", "0x74b5": "AMD_Instinct_MI300X", # MI300X VF + "0x74a2": "AMD_Instinct_MI308X", "0x74a5": "AMD_Instinct_MI325X", "0x74b9": "AMD_Instinct_MI325X", # MI325X VF "0x74a9": "AMD_Instinct_MI300X_HF", @@ -413,7 +414,7 @@ class RocmPlatform(Platform): "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" " is not set, enabling VLLM_USE_TRITON_AWQ." ) - envs.VLLM_USE_TRITON_AWQ = True + os.environ["VLLM_USE_TRITON_AWQ"] = "1" @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 090d924144..72a8320cc1 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Annotated, Any, Optional +from typing import Annotated, Any, Optional import msgspec +from vllm.config import ModelConfig, PoolerConfig +from vllm.config.pooler import get_use_activation from vllm.sampling_params import RequestOutputKind from vllm.tasks import PoolingTask -if TYPE_CHECKING: - from vllm.config import ModelConfig, PoolerConfig - class PoolingParams( msgspec.Struct, @@ -25,10 +24,12 @@ class PoolingParams( Set to -1 to use the model's default truncation size. Set to k to keep only the last k tokens (left truncation). Set to None to disable truncation. - normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. - activation: Whether to apply activation function to + normalize: Whether to normalize the embeddings outputs. + softmax: softmax will be deprecated, please use use_activation instead. + activation: activation will be deprecated, please use use_activation instead. + use_activation: Whether to apply activation function to the classification outputs. """ @@ -44,7 +45,9 @@ class PoolingParams( ## for classification, scoring and rerank # --8<-- [start:classification-pooling-params] + softmax: bool | None = None activation: bool | None = None + use_activation: bool | None = None # --8<-- [end:classification-pooling-params] ## for step pooling models @@ -59,16 +62,16 @@ class PoolingParams( @property def all_parameters(self) -> list[str]: - return ["dimensions", "normalize", "activation"] + return ["dimensions", "normalize", "use_activation"] @property def valid_parameters(self): return { "embed": ["dimensions", "normalize"], - "classify": ["activation"], - "score": ["activation"], + "classify": ["use_activation"], + "score": ["use_activation"], "token_embed": ["dimensions", "normalize"], - "token_classify": ["activation"], + "token_classify": ["use_activation"], } def clone(self) -> "PoolingParams": @@ -84,6 +87,9 @@ class PoolingParams( msg = f"You cannot overwrite {self.task=!r} with {task=!r}!" 
raise ValueError(msg) + # raise deprecated warning for softmax and activation + self.use_activation = get_use_activation(self) + # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": @@ -168,8 +174,8 @@ class PoolingParams( raise ValueError("Dimensions must be greater than 0") elif self.task in ["classify", "score", "token_classify"]: - if self.activation is None: - self.activation = True + if self.use_activation is None: + self.use_activation = True else: raise ValueError(f"Unknown pooling task: {self.task}") @@ -197,7 +203,7 @@ class PoolingParams( f"task={self.task}, " f"normalize={self.normalize}, " f"dimensions={self.dimensions}, " - f"activation={self.activation}, " + f"use_activation={self.use_activation}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids}, " diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 1c0fce702b..829b63d8a7 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -7,7 +7,6 @@ from collections.abc import Callable from dataclasses import asdict, dataclass, field from typing import Any, Optional, TypeAlias -import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent from torch.autograd.profiler import FunctionEvent @@ -21,6 +20,12 @@ from vllm.profiler.utils import ( event_torch_op_stack_trace, indent_string, ) +from vllm.utils.import_utils import PlaceholderModule + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") @dataclass diff --git a/vllm/test_utils.py b/vllm/test_utils.py deleted file mode 100644 index 91dcc2fd84..0000000000 --- a/vllm/test_utils.py +++ /dev/null @@ -1,129 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -MODELS_ON_S3 = [ - "adept/fuyu-8b", - "ai21labs/AI21-Jamba-1.5-Mini", - "ai21labs/Jamba-tiny-random", - "ai21labs/Jamba-tiny-reward-dev", - "allenai/Molmo-7B-D-0924", - "allenai/OLMo-1B-hf", - "allenai/OLMoE-1B-7B-0924-Instruct", - "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test", - "AMead10/Llama-3.2-1B-Instruct-AWQ", - "hmellor/Ilama-3.2-1B", - "BAAI/bge-base-en-v1.5", - "BAAI/bge-multilingual-gemma2", - "BAAI/bge-reranker-v2-m3", - "bigcode/starcoder2-3b", - "cross-encoder/ms-marco-MiniLM-L-6-v2", - "cross-encoder/quora-roberta-base", - "deepseek-ai/deepseek-vl2-tiny", - "distilbert/distilgpt2", - "facebook/bart-base", - "facebook/bart-large-cnn", - # "fixie-ai/ultravox-v0_5-llama-3_2-1b", - "google/gemma-1.1-2b-it", - "google/gemma-2-2b-it", - "google/paligemma-3b-pt-224", - "h2oai/h2ovl-mississippi-800m", - "HuggingFaceM4/Idefics3-8B-Llama3", - "internlm/internlm2-1_8b-reward", - "intfloat/e5-mistral-7b-instruct", - "intfloat/multilingual-e5-small", - "jason9693/Qwen2.5-1.5B-apeach", - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "llava-hf/LLaVA-NeXT-Video-7B-hf", - # "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-3.2-1B", - "meta-llama/Llama-3.2-1B-Instruct", - "meta-llama/Meta-Llama-3-8B", - "microsoft/phi-2", - "microsoft/Phi-3-mini-4k-instruct", - "microsoft/Phi-3-small-8k-instruct", - "microsoft/Phi-3-vision-128k-instruct", - "microsoft/Phi-3.5-MoE-instruct", - "microsoft/Phi-3.5-vision-instruct", - # 
"mistralai/Mistral-7B-Instruct-v0.1", - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "mistralai/Pixtral-12B-2409", - "mistral-community/Mixtral-8x22B-v0.1-AWQ", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", - "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", - "neuralmagic/Llama-3.2-1B-quantized.w8a8", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", - "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", - "nm-testing/llama2.c-stories42M-pruned2.4-compressed", - "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", - "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", - "nm-testing/Phi-3-mini-128k-instruct-FP8", - "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", - "nm-testing/tinyllama-oneshot-w4a16-channel-v2", - "nm-testing/tinyllama-oneshot-w4a16-group128-v2", - "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", - "nm-testing/tinyllama-oneshot-w8a16-per-channel", - "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", - "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", - "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", - "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", - "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme", - "nvidia/NVLM-D-72B", - "openai-community/gpt2", - # "openai/whisper-large-v3", - "openbmb/MiniCPM-o-2_6", - "openbmb/MiniCPM-V-2_6", - "OpenGVLab/InternVL2-1B", - "parasail-ai/GritLM-7B-vllm", - "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "Qwen/Qwen2-7B-Instruct", - "Qwen/Qwen2-Audio-7B-Instruct", - "Qwen/Qwen2-VL-2B-Instruct", - "Qwen/Qwen2.5-1.5B-Instruct", - "Qwen/Qwen2.5-Math-PRM-7B", - "Qwen/Qwen2.5-Math-RM-72B", - "Qwen/Qwen2.5-VL-3B-Instruct", - "royokong/e5-v", - 
"sentence-transformers/all-roberta-large-v1", - "sentence-transformers/stsb-roberta-base-v2", - "allenai/OLMo-2-0425-1B", - "shuyuej/Llama-3.2-1B-Instruct-GPTQ", - "ssmits/Qwen2-7B-Instruct-embed-base", - "stabilityai/stablelm-3b-4e1t", - "stabilityai/stablelm-zephyr-3b", - "state-spaces/mamba-130m-hf", - "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", - "zai-org/glm-4v-9b", - "TIGER-Lab/Mantis-8B-siglip-llama3", - "TIGER-Lab/VLM2Vec-Full", - "tiiuae/falcon-40b", - "tiiuae/falcon-mamba-7b-instruct", - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "upstage/solar-pro-preview-instruct", -] - -MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights" diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 34c0429a80..b1f4e3e2a9 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -79,6 +79,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( deepseek_v3="DeepseekV3Config", deepseek_v32="DeepseekV3Config", flex_olmo="FlexOlmoConfig", + kimi_linear="KimiLinearConfig", kimi_vl="KimiVLConfig", Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config", RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index befe9cdae7..663a8e44d7 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -19,6 +19,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig from vllm.transformers_utils.configs.jais import JAISConfig +from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig from vllm.transformers_utils.configs.medusa import MedusaConfig @@ -54,6 +55,7 @@ __all__ = [ "MiDashengLMConfig", "MLPSpeculatorConfig", "MoonViTConfig", + "KimiLinearConfig", "KimiVLConfig", "NemotronConfig", "NemotronHConfig", diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py new file mode 100644 index 0000000000..65ddf48c52 --- /dev/null +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers.configuration_utils import PretrainedConfig + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class KimiLinearConfig(PretrainedConfig): + model_type = "kimi_linear" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + model_type="kimi_linear", + vocab_size=163840, + hidden_size=4096, + head_dim=None, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + rope_theta=10000.0, + rope_scaling=None, + tie_word_embeddings=False, + moe_intermediate_size: int | None = None, + moe_renormalize: bool = True, + moe_router_activation_func: str = "sigmoid", + num_experts: int | None = None, + num_experts_per_token: int | None = None, + num_shared_experts: int = 0, + routed_scaling_factor: float = 1.0, + first_k_dense_replace: int = 0, + moe_layer_freq: int = 1, + use_grouped_topk: bool = True, + 
num_expert_group: int = 1, + topk_group: int = 1, + q_lora_rank: int | None = None, + kv_lora_rank: int | None = None, + qk_nope_head_dim: int | None = None, + qk_rope_head_dim: int | None = None, + v_head_dim: int | None = None, + mla_use_nope: bool | None = False, + num_nextn_predict_layers: int = 0, + linear_attn_config: dict | None = None, + **kwargs, + ): + self.model_type = model_type + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.head_dim = ( + head_dim if head_dim is not None else hidden_size // num_attention_heads + ) + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.mla_use_nope = mla_use_nope + # moe config + self.num_experts = num_experts + self.num_experts_per_token = num_experts_per_token + self.moe_renormalize = moe_renormalize + self.num_shared_experts = num_shared_experts + self.routed_scaling_factor = routed_scaling_factor + self.moe_router_activation_func = moe_router_activation_func + assert self.moe_router_activation_func in ("softmax", "sigmoid") + self.moe_intermediate_size = moe_intermediate_size + self.first_k_dense_replace = first_k_dense_replace + self.moe_layer_freq = moe_layer_freq + self.use_grouped_topk = use_grouped_topk + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.num_nextn_predict_layers = num_nextn_predict_layers + + if linear_attn_config is not None: + assert linear_attn_config["kda_layers"] is not None + assert linear_attn_config["full_attn_layers"] is not None + self.linear_attn_config = linear_attn_config + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def is_mla(self): + return ( + self.q_lora_rank is not None + or self.kv_lora_rank is not None + or self.qk_nope_head_dim is not None + or self.qk_rope_head_dim is not None + or self.v_head_dim is not None + or self.mla_use_nope is True + ) + + @property + def is_moe(self): + return self.num_experts is not None + + @property + def is_linear_attn(self) -> bool: + return not ( + self.linear_attn_config is None + or ( + isinstance(self.linear_attn_config, dict) + and self.linear_attn_config["kda_layers"] is not None + and len(self.linear_attn_config["kda_layers"]) == 0 + ) + ) + + def is_kda_layer(self, layer_idx: int): + return ( + self.linear_attn_config is not None + and (layer_idx + 1) in self.linear_attn_config["kda_layers"] + ) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 389baf1488..07d62e9849 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -811,8 +811,8 @@ def reorder_batch_to_split_decodes_and_prefills( num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs] is_decode = num_scheduled_tokens_np <= decode_threshold - is_extend = (~is_decode) & 
(num_computed_tokens_np > num_scheduled_tokens_np) - is_prefill = (~is_decode) & (num_computed_tokens_np == num_scheduled_tokens_np) + is_extend = (~is_decode) & (num_computed_tokens_np > 0) + is_prefill = (~is_decode) & (num_computed_tokens_np == 0) # Desired order: decode → extend → prefill req_regions = np.zeros(is_decode.shape, dtype=np.int32) # 0 = decode by default @@ -832,11 +832,11 @@ def reorder_batch_to_split_decodes_and_prefills( return False # Extract indices that need swapping and sort by target region - swap_indices = np.where(needs_swap)[0] + orig_indices = np.where(needs_swap)[0] sorted_order = np.argsort(req_regions[needs_swap], kind="stable") - dest_indices = swap_indices[sorted_order] + src_indices = orig_indices[sorted_order] - src_dest_map = {int(src): int(dst) for src, dst in zip(swap_indices, dest_indices)} + src_dest_map = {int(src): int(dst) for src, dst in zip(src_indices, orig_indices)} for src in src_dest_map: dst = src_dest_map[src] diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 035394f045..cc6b89e2bf 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -2,8 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass +from functools import cached_property from typing import TYPE_CHECKING +from typing_extensions import deprecated + from vllm._bc_linter import bc_linter_include if TYPE_CHECKING: @@ -96,16 +99,16 @@ class NewRequestData: @dataclass class CachedRequestData: req_ids: list[str] - # If resumed_from_preemption is False, new_block_ids will be appended to - # the request's block IDs. If True, new_block_ids will be used as the + # For request ids not in resumed_req_ids, new_block_ids will be appended to + # the request's block IDs. For those in the set, new_block_ids will be used as the # request's block IDs instead of appending to the existing block IDs. - resumed_from_preemption: list[bool] + resumed_req_ids: set[str] # NOTE(woosuk): new_token_ids is only used for pipeline parallelism. # When PP is not used, new_token_ids will be empty. new_token_ids: list[list[int]] - # If resumed_from_preemption is True, propogate the token ids to the - # connector, otherwise will be empty. - resumed_req_token_ids: list[list[int] | None] + # For requests not scheduled in the last step, propagate the token ids to the + # connector. Won't contain requests that were scheduled in the prior step. + all_token_ids: dict[str, list[int]] new_block_ids: list[tuple[list[int], ...] 
| None] num_computed_tokens: list[int] num_output_tokens: list[int] @@ -114,13 +117,26 @@ class CachedRequestData: def num_reqs(self) -> int: return len(self.req_ids) + @cached_property + @deprecated("use resumed_req_ids field") + def resumed_from_preemption(self) -> list[bool]: + return [req_id in self.resumed_req_ids for req_id in self.req_ids] + + @cached_property + @deprecated("use all_token_ids field") + def resumed_req_token_ids(self) -> list[list[int] | None]: + return [ + self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None + for req_id in self.req_ids + ] + @classmethod def make_empty(cls) -> "CachedRequestData": return cls( req_ids=[], - resumed_from_preemption=[], + resumed_req_ids=set(), new_token_ids=[], - resumed_req_token_ids=[], + all_token_ids={}, new_block_ids=[], num_computed_tokens=[], num_output_tokens=[], diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 14bdf29531..ad6fbee2ec 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -71,6 +71,7 @@ class Scheduler(SchedulerInterface): self.finished_req_ids_dict: dict[int, set[str]] | None = ( defaultdict(set) if include_finished_set else None ) + self.prev_step_scheduled_req_ids: set[str] = set() # Scheduling constraints. self.max_num_running_reqs = self.scheduler_config.max_num_seqs @@ -166,7 +167,7 @@ class Scheduler(SchedulerInterface): self.kv_cache_manager = KVCacheManager( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, - enable_caching=self.cache_config.enable_prefix_caching, + enable_caching=bool(self.cache_config.enable_prefix_caching), use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, @@ -406,13 +407,13 @@ class Scheduler(SchedulerInterface): # Get externally-cached tokens if using a KVConnector. if self.connector is not None: - num_external_computed_tokens, load_kv_async = ( + ext_tokens, load_kv_async = ( self.connector.get_num_new_matched_tokens( request, num_new_local_computed_tokens ) ) - if num_external_computed_tokens is None: + if ext_tokens is None: # The request cannot be scheduled because # the KVConnector couldn't determine # the number of matched tokens. @@ -420,6 +421,8 @@ class Scheduler(SchedulerInterface): skipped_waiting_requests.prepend_request(request) continue + num_external_computed_tokens = ext_tokens + # Total computed tokens (local + external). num_computed_tokens = ( num_new_local_computed_tokens + num_external_computed_tokens @@ -444,14 +447,9 @@ class Scheduler(SchedulerInterface): # `request.num_prompt_tokens` to consider the resumed # requests, which have output tokens. num_new_tokens = request.num_tokens - num_computed_tokens - if ( - 0 - < self.scheduler_config.long_prefill_token_threshold - < num_new_tokens - ): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold - ) + threshold = self.scheduler_config.long_prefill_token_threshold + if 0 < threshold < num_new_tokens: + num_new_tokens = threshold # chunked prefill has to be enabled explicitly to allow # pooling requests to be chunked @@ -620,6 +618,11 @@ class Scheduler(SchedulerInterface): structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask( num_scheduled_tokens.keys(), scheduled_spec_decode_tokens ) + + # Record the request ids that were scheduled in this step. 
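Note: the `resumed_from_preemption` list is replaced by the `resumed_req_ids` set above, with deprecated cached properties kept for backward compatibility. The following is an illustrative sketch of that pattern, not the vLLM class itself; the class and field names are simplified stand-ins, and it assumes a `typing_extensions` version that provides PEP 702 `deprecated`.

```python
# Sketch only: mirrors the set-backed field + deprecated list accessor pattern
# used by CachedRequestData above. Class and field names are illustrative.
from dataclasses import dataclass, field
from functools import cached_property

from typing_extensions import deprecated


@dataclass
class ReqDataSketch:
    req_ids: list[str] = field(default_factory=list)
    resumed_req_ids: set[str] = field(default_factory=set)

    @cached_property
    @deprecated("use resumed_req_ids field")
    def resumed_from_preemption(self) -> list[bool]:
        # Expand the set back into the legacy per-request boolean list,
        # computed once and cached on first (deprecated) access.
        return [req_id in self.resumed_req_ids for req_id in self.req_ids]


data = ReqDataSketch(req_ids=["a", "b", "c"], resumed_req_ids={"b"})
assert data.resumed_from_preemption == [False, True, False]
```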
+ self.prev_step_scheduled_req_ids.clear() + self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys()) + scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_cached_reqs=cached_reqs_data, @@ -646,23 +649,6 @@ class Scheduler(SchedulerInterface): meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta - # collect KV cache events from KV cache manager - events = self.kv_cache_manager.take_events() - - # collect KV cache events from connector - if self.connector is not None: - connector_events = self.connector.take_events() - if connector_events: - if events is None: - events = list(connector_events) - else: - events.extend(connector_events) - - # publish collected KV cache events - if events: - batch = KVEventBatch(ts=time.time(), events=events) - self.kv_event_publisher.publish(batch) - self._update_after_schedule(scheduler_output) return scheduler_output @@ -708,14 +694,12 @@ class Scheduler(SchedulerInterface): req_ids: list[str] = [] new_token_ids: list[list[int]] = [] new_block_ids: list[tuple[list[int], ...] | None] = [] - resumed_req_token_ids: list[list[int] | None] = [] + all_token_ids: dict[str, list[int]] = {} num_computed_tokens: list[int] = [] num_output_tokens: list[int] = [] + resumed_req_ids = set() - # Because resumed_reqs is usually empty, it is more efficient to do - # in-place appending so that we don't need to allocate a new list. - resumed_from_preemption = [False] * len(running_reqs) - resumed_from_preemption += [True] * len(resumed_reqs) + num_running_reqs = len(running_reqs) for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)): req_id = req.request_id req_ids.append(req_id) @@ -732,12 +716,14 @@ class Scheduler(SchedulerInterface): req.num_computed_tokens : req.num_computed_tokens + num_tokens ] new_token_ids.append(token_ids) - resumed_token_ids = None - if resumed_from_preemption[idx]: - resumed_token_ids = req.all_token_ids[ + scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids + if idx >= num_running_reqs: + assert not scheduled_in_prev_step + resumed_req_ids.add(req_id) + if not scheduled_in_prev_step: + all_token_ids[req_id] = req.all_token_ids[ : req.num_computed_tokens + num_tokens ] - resumed_req_token_ids.append(resumed_token_ids) new_block_ids.append( req_to_new_blocks[req_id].get_block_ids(allow_none=True) ) @@ -748,9 +734,9 @@ class Scheduler(SchedulerInterface): return CachedRequestData( req_ids=req_ids, - resumed_from_preemption=resumed_from_preemption, + resumed_req_ids=resumed_req_ids, new_token_ids=new_token_ids, - resumed_req_token_ids=resumed_req_token_ids, + all_token_ids=all_token_ids, new_block_ids=new_block_ids, num_computed_tokens=num_computed_tokens, num_output_tokens=num_output_tokens, @@ -921,13 +907,13 @@ class Scheduler(SchedulerInterface): outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: SpecDecodingStats | None = None - kv_connector_stats = ( + kv_connector_stats: KVConnectorStats | None = ( kv_connector_output.kv_connector_stats if kv_connector_output else None ) if kv_connector_stats and self.connector: - stats = self.connector.get_kv_connector_stats() - if stats: - kv_connector_stats = kv_connector_stats.aggregate(stats) + kv_stats = self.connector.get_kv_connector_stats() + if kv_stats: + kv_connector_stats = kv_connector_stats.aggregate(kv_stats) failed_kv_load_req_ids = None if kv_connector_output and kv_connector_output.invalid_block_ids: @@ -1057,6 +1043,23 @@ class 
Scheduler(SchedulerInterface): if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) + # collect KV cache events from KV cache manager + events = self.kv_cache_manager.take_events() + + # collect KV cache events from connector + if self.connector is not None: + connector_events = self.connector.take_events() + if connector_events: + if events is None: + events = list(connector_events) + else: + events.extend(connector_events) + + # publish collected KV cache events + if events: + batch = KVEventBatch(ts=time.time(), events=events) + self.kv_event_publisher.publish(batch) + # Create EngineCoreOutputs for all clients that have requests with # outputs in this step. engine_core_outputs = { diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 6699fb9818..575ae3d7d8 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -394,7 +394,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager): # skipped during the attention computation. last_useful_token = num_computed_tokens - self.sliding_window + 1 last_useful_block = last_useful_token // self.block_size + if last_useful_block <= 0: + # Early return if tokens are not enough to fill the sliding window + return blocks = self.req_to_blocks[request_id] + if blocks[last_useful_block - 1] == self._null_block: + # Early return if there are no blocks to remove + return removed_blocks: list[KVCacheBlock] = [] for i in range(last_useful_block - 1, -1, -1): if blocks[i] == self._null_block: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index cf458a8f07..dc61d45015 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -6,7 +6,7 @@ import socket import time from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy -from typing import Any +from typing import Any, cast import numpy as np import torch @@ -131,10 +131,9 @@ class AsyncLLM(EngineClient): self.output_processor = OutputProcessor( self.tokenizer, log_stats=self.log_stats ) - if self.observability_config.otlp_traces_endpoint is not None: - tracer = init_tracer( - "vllm.llm_engine", self.observability_config.otlp_traces_endpoint - ) + endpoint = self.observability_config.otlp_traces_endpoint + if endpoint is not None: + tracer = init_tracer("vllm.llm_engine", endpoint) self.output_processor.tracer = tracer # EngineCore (starts the engine in background process). @@ -266,7 +265,9 @@ class AsyncLLM(EngineClient): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - cancel_task_threadsafe(getattr(self, "output_handler", None)) + handler = getattr(self, "output_handler", None) + if handler is not None: + cancel_task_threadsafe(handler) async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return await self.engine_core.get_supported_tasks_async() @@ -314,7 +315,10 @@ class AsyncLLM(EngineClient): priority, data_parallel_rank, ) - prompt_text = prompt if isinstance(prompt, str) else prompt.get("prompt") + if isinstance(prompt, str): + prompt_text = prompt + elif isinstance(prompt, Mapping): + prompt_text = cast(str | None, prompt.get("prompt")) if is_pooling or params.n == 1: await self._add_request(request, prompt_text, None, 0, queue) @@ -436,6 +440,7 @@ class AsyncLLM(EngineClient): # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. 
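Note: the SlidingWindowManager hunk above adds two early returns so that no block removal is attempted before the sliding window is filled. A small standalone sketch of the index math it guards follows; the window and block sizes are illustrative assumptions, not vLLM defaults.

```python
# Standalone sketch of the guard added above: blocks can only be freed once
# last_useful_block is positive, i.e. the computed tokens exceed the window.
def last_useful_block(num_computed_tokens: int, sliding_window: int, block_size: int) -> int:
    last_useful_token = num_computed_tokens - sliding_window + 1
    return last_useful_token // block_size


# Window of 1024 tokens, 16-token blocks: nothing is freed at 512 computed
# tokens (early return), while 2048 computed tokens allow blocks [0, 64)
# to be considered for removal.
assert last_useful_block(512, 1024, 16) <= 0
assert last_useful_block(2048, 1024, 16) == 64
```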
finished = out.finished + assert isinstance(out, RequestOutput) yield out # If the request is disconnected by the client, generate() @@ -653,7 +658,7 @@ class AsyncLLM(EngineClient): return self.tokenizer async def is_tracing_enabled(self) -> bool: - return self.observability_config.otlp_traces_endpoint is not None + return self.observability_config.otlp_traces_endpoint is not None # type: ignore async def do_log_stats(self) -> None: if self.logger_manager: @@ -689,9 +694,15 @@ class AsyncLLM(EngineClient): await self.reset_prefix_cache() await self.engine_core.sleep_async(level) + if self.logger_manager is not None: + self.logger_manager.record_sleep_state(1, level) + async def wake_up(self, tags: list[str] | None = None) -> None: await self.engine_core.wake_up_async(tags) + if self.logger_manager is not None: + self.logger_manager.record_sleep_state(0, 0) + async def is_sleeping(self) -> bool: return await self.engine_core.is_sleeping_async() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 85cab32ebf..6cbd986b3c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1075,6 +1075,7 @@ class DPEngineCoreProc(EngineCoreProc): local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local assert dp_size > 1 + assert local_dp_rank is not None assert 0 <= local_dp_rank <= dp_rank < dp_size if vllm_config.kv_transfer_config is not None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7b554ca991..9b440505bd 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -385,10 +385,11 @@ class BackgroundResources: with contextlib.suppress(Exception): task.cancel() - if in_loop(loop): - close_sockets_and_tasks() - elif loop and not loop.is_closed(): - loop.call_soon_threadsafe(close_sockets_and_tasks) + if loop is not None: + if in_loop(loop): + close_sockets_and_tasks() + elif not loop.is_closed(): + loop.call_soon_threadsafe(close_sockets_and_tasks) else: # Loop has been closed, try to clean up directly. del tasks @@ -1044,6 +1045,7 @@ class DPAsyncMPClient(AsyncMPClient): return assert self.stats_update_address is not None + stats_addr: str = self.stats_update_address assert len(self.engine_ranks_managed) > 0 # NOTE: running and waiting counts are all global from # the Coordinator include all global EngineCores. This @@ -1054,9 +1056,7 @@ class DPAsyncMPClient(AsyncMPClient): async def run_engine_stats_update_task(): with ( - make_zmq_socket( - self.ctx, self.stats_update_address, zmq.XSUB, linger=0 - ) as socket, + make_zmq_socket(self.ctx, stats_addr, zmq.XSUB, linger=0) as socket, make_zmq_socket( self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=False, linger=0 ) as first_req_rcv_socket, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5f66e36893..b7a24096bf 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -69,14 +69,21 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # Stop strings params = request.sampling_params assert params is not None - self.stop = stop = params.stop + stop_list: list[str] + if params.stop is None: + stop_list = [] + elif isinstance(params.stop, str): + stop_list = [params.stop] + else: + stop_list = params.stop + self.stop = stop_list self.min_tokens = params.min_tokens self.include_stop_str_in_output = params.include_stop_str_in_output # Number of chars to hold back when stop strings are to be excluded # from streamed output. 
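Note: the detokenizer change above normalizes `params.stop` (None, a single string, or a list) into a plain list before deriving the hold-back length. A minimal sketch of that logic as standalone helpers (the helper names are illustrative, not vLLM APIs):

```python
# Sketch of the stop-string handling above, written as free functions for clarity.
def normalize_stop(stop: str | list[str] | None) -> list[str]:
    if stop is None:
        return []
    if isinstance(stop, str):
        return [stop]
    return stop


def stop_buffer_length(stop: list[str], include_stop_str_in_output: bool) -> int:
    # Hold back up to len(longest stop string) - 1 characters, since a partial
    # match at the end of the stream may still grow into a full stop string.
    if stop and not include_stop_str_in_output:
        return max(len(s) for s in stop) - 1
    return 0


assert normalize_stop(None) == []
assert normalize_stop("</s>") == ["</s>"]
assert stop_buffer_length(["</s>", "END"], include_stop_str_in_output=False) == 3
assert stop_buffer_length(["</s>"], include_stop_str_in_output=True) == 0
```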
- if stop and not self.include_stop_str_in_output: - self.stop_buffer_length = max(len(s) for s in stop) - 1 + if self.stop and not self.include_stop_str_in_output: + self.stop_buffer_length = max(len(s) for s in self.stop) - 1 else: self.stop_buffer_length = 0 self._last_output_text_offset: int = 0 diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 486dacb2e5..c2ca9579d5 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -4,7 +4,7 @@ import time from collections.abc import Callable, Mapping from copy import copy -from typing import Any +from typing import Any, cast import torch.nn as nn from typing_extensions import TypeVar @@ -112,10 +112,9 @@ class LLMEngine: self.output_processor = OutputProcessor( self.tokenizer, log_stats=self.log_stats ) - if self.observability_config.otlp_traces_endpoint is not None: - tracer = init_tracer( - "vllm.llm_engine", self.observability_config.otlp_traces_endpoint - ) + endpoint = self.observability_config.otlp_traces_endpoint + if endpoint is not None: + tracer = init_tracer("vllm.llm_engine", endpoint) self.output_processor.tracer = tracer # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) @@ -259,7 +258,10 @@ class LLMEngine: trace_headers, priority, ) - prompt_text = prompt if isinstance(prompt, str) else prompt.get("prompt") + if isinstance(prompt, str): + prompt_text = prompt + elif isinstance(prompt, Mapping): + prompt_text = cast(str | None, prompt.get("prompt")) n = params.n if isinstance(params, SamplingParams) else 1 @@ -285,7 +287,7 @@ class LLMEngine: # Add the request to EngineCore. self.engine_core.add_request(child_request) - def step(self) -> list[RequestOutput] | list[PoolingRequestOutput]: + def step(self) -> list[RequestOutput | PoolingRequestOutput]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False self.engine_core.execute_dummy_batch() @@ -332,9 +334,15 @@ class LLMEngine: def sleep(self, level: int = 1): self.engine_core.sleep(level) + if self.logger_manager is not None: + self.logger_manager.record_sleep_state(1, level) + def wake_up(self, tags: list[str] | None = None): self.engine_core.wake_up(tags) + if self.logger_manager is not None: + self.logger_manager.record_sleep_state(0, 0) + def is_sleeping(self) -> bool: return self.engine_core.is_sleeping() diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 44e4eadce4..07c8113dd9 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -44,10 +44,16 @@ class RequestOutputCollector: if self.output is None or isinstance(output, Exception): self.output = output self.ready.set() - elif isinstance(self.output, (RequestOutput, PoolingRequestOutput)): + elif isinstance(self.output, RequestOutput) and isinstance( + output, RequestOutput + ): # This ensures that request outputs with different request indexes # (if n > 1) do not override each other. self.output.add(output, aggregate=self.aggregate) + elif isinstance(self.output, PoolingRequestOutput) and isinstance( + output, PoolingRequestOutput + ): + self.output = output async def get(self) -> RequestOutput | PoolingRequestOutput: """Get operation blocks on put event.""" @@ -408,7 +414,7 @@ class OutputProcessor: within the loop below. 
""" - request_outputs: list[RequestOutput] | list[PoolingRequestOutput] = [] + request_outputs: list[RequestOutput | PoolingRequestOutput] = [] reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 2a47befec2..26ee10d2b9 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import copy -from typing import Optional +from typing import Optional, cast from vllm.outputs import CompletionOutput from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -37,7 +37,7 @@ class ParentRequest: self.child_requests = set() self.output_aggregator = ( - [None] * sampling_params.n + [cast(CompletionOutput, None)] * sampling_params.n if (sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) else [] ) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index de15677aee..c49fd1bde8 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -3,7 +3,7 @@ import time from collections.abc import Mapping -from typing import Any, Literal +from typing import Any, Literal, cast from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs @@ -208,9 +208,9 @@ class Processor: enc = prompt.get("encoder_prompt") dec = prompt.get("decoder_prompt") if enc is not None: - _validate_single_prompt(enc) + _validate_single_prompt(cast(dict | str, enc)) if dec is not None: - _validate_single_prompt(dec) + _validate_single_prompt(cast(dict | str, dec)) else: _validate_single_prompt(prompt) # type: ignore[arg-type] @@ -332,7 +332,7 @@ class Processor: if not mm_data: return None - mm_uuids: MultiModalUUIDDict = {} + mm_uuids: dict[str, list[str | None] | str] = {} for modality, data in mm_data.items(): n = len(data) if isinstance(data, list) else 1 mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] @@ -384,7 +384,9 @@ class Processor: # if provided. self._validate_multi_modal_uuids(prompt) if isinstance(prompt, dict): - mm_uuids = prompt.get("multi_modal_uuids") + mm_uuids = cast( + MultiModalUUIDDict | None, prompt.get("multi_modal_uuids") + ) else: mm_uuids = None @@ -410,20 +412,13 @@ class Processor: encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) self._validate_model_inputs(encoder_inputs, decoder_inputs) - # Mypy does not always properly infer the types of some elements of - # discriminated unions of TypedDicts, because of how it handles - # inheritance of TypedDict. If we explicitly extract the items we want - # we can avoid type errors from using `dict.get` later in the method. - prompt_token_ids = ( - decoder_inputs["prompt_token_ids"] - if decoder_inputs["type"] != "embeds" - else None - ) - prompt_embeds = ( - decoder_inputs["prompt_embeds"] - if decoder_inputs["type"] == "embeds" - else None - ) + # Mypy can be conservative for TypedDict unions; normalize access. 
+ if decoder_inputs["type"] == "embeds": + prompt_token_ids = None + prompt_embeds = decoder_inputs["prompt_embeds"] + else: + prompt_token_ids = decoder_inputs["prompt_token_ids"] + prompt_embeds = None sampling_params = None pooling_params = None diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index 518f1582fa..382f008266 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): try: ray.get(pg_ready_ref, timeout=0) except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` and `ray list nodes` to make sure the cluster has " - "enough resources." - ) from None + # Provide more helpful error message when GPU count is exceeded + total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs) + # If more than one GPU is required for the placement group, provide a + # more specific error message. + # We use >1 here because multi-GPU (tensor parallel) jobs are more + # likely to fail due to insufficient cluster resources, and users may + # need to adjust tensor_parallel_size to fit available GPUs. + if total_gpu_required > 1: + raise ValueError( + f"Cannot provide a placement group requiring " + f"{total_gpu_required} GPUs " + f"(placement_group_specs={placement_group_specs}) within " + f"{PG_WAIT_TIMEOUT} seconds.\n" + f"Tensor parallel size may exceed available GPUs in your " + f"cluster. Check resources with `ray status` and " + f"`ray list nodes`.\n" + f"If running on K8s with limited GPUs, consider reducing " + f"--tensor-parallel-size to match available GPU resources." + ) from None + else: + raise ValueError( + "Cannot provide a placement group of " + f"{placement_group_specs=} within " + f"{PG_WAIT_TIMEOUT} seconds. See " + "`ray status` and `ray list nodes` to make sure the cluster " + "has enough resources." + ) from None def _wait_until_pg_removed(current_placement_group: "PlacementGroup"): @@ -299,6 +320,23 @@ def initialize_ray_cluster( assert_ray_available() from vllm.platforms import current_platform + # Prevalidate GPU requirements before Ray processing + if current_platform.is_cuda() and parallel_config.world_size > 1: + from vllm.utils import cuda_device_count_stateless + + available_gpus = cuda_device_count_stateless() + if parallel_config.world_size > available_gpus: + logger.warning( + "Tensor parallel size (%d) exceeds available GPUs (%d). " + "This may result in Ray placement group allocation failures. " + "Consider reducing tensor_parallel_size to %d or less, " + "or ensure your Ray cluster has %d GPUs available.", + parallel_config.world_size, + available_gpus, + available_gpus, + parallel_config.world_size, + ) + if ray.is_initialized(): logger.info("Ray is already initialized. 
Skipping Ray initialization.") elif current_platform.is_rocm() or current_platform.is_xpu(): diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index 250ed5e95a..f765d19ea0 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -51,9 +51,9 @@ class CPUOffloadingSpec(OffloadingSpec): self, kv_caches: dict[str, torch.Tensor] ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: - if not current_platform.is_cuda(): + if not current_platform.is_cuda_alike(): raise Exception( - "CPU Offloading is currently only supported on CUDA GPUs" + "CPU Offloading is currently only supported on CUDA-alike GPUs" ) layer_names = list(kv_caches.keys()) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index c5d7885eef..3772f07066 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,8 +9,12 @@ from typing import TypeAlias from prometheus_client import Counter, Gauge, Histogram +import vllm.envs as envs from vllm.config import SupportsMetricsInfo, VllmConfig -from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorLogging +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorLogging, + KVConnectorPrometheus, +) from vllm.logger import init_logger from vllm.plugins import load_plugins_by_group from vllm.v1.engine import FinishReason @@ -56,6 +60,9 @@ class StatLoggerBase(ABC): def log(self): # noqa pass + def record_sleep_state(self, is_awake: int, level: int): # noqa + pass + def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]: factories: list[StatLoggerFactory] = [] @@ -335,6 +342,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase): _counter_cls = Counter _histogram_cls = Histogram _spec_decoding_cls = SpecDecodingProm + _kv_connector_cls = KVConnectorPrometheus def __init__( self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None @@ -354,12 +362,15 @@ class PrometheusStatLogger(AggregateStatLoggerBase): model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - spec_decode_labelvalues: dict[int, list[str]] = { + per_engine_labelvalues: dict[int, list[str]] = { idx: [model_name, str(idx)] for idx in engine_indexes } self.spec_decoding_prom = self._spec_decoding_cls( - vllm_config.speculative_config, labelnames, spec_decode_labelvalues + vllm_config.speculative_config, labelnames, per_engine_labelvalues + ) + self.kv_connector_prom = self._kv_connector_cls( + vllm_config, labelnames, per_engine_labelvalues ) # @@ -384,8 +395,33 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_scheduler_waiting = make_per_engine( gauge_scheduler_waiting, engine_indexes, model_name ) + if envs.VLLM_SERVER_DEV_MODE: + gauge_engine_sleep_state = self._gauge_cls( + name="vllm:engine_sleep_state", + documentation=( + "Engine sleep state; awake = 0 means engine is sleeping; " + "awake = 1 means engine is awake; " + "weights_offloaded = 1 means sleep level 1; " + "discard_all = 1 means sleep level 2." 
+ ), + labelnames=labelnames + ["sleep_state"], + multiprocess_mode="mostrecent", + ) + + self.gauge_engine_sleep_state = {} + sleep_state = ["awake", "weights_offloaded", "discard_all"] + + for s in sleep_state: + self.gauge_engine_sleep_state[s] = { + idx: gauge_engine_sleep_state.labels( + engine=idx, model_name=model_name, sleep_state=s + ) + for idx in engine_indexes + } + + # Setting default values + self.record_sleep_state() - # # GPU cache # # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc @@ -933,6 +969,11 @@ class PrometheusStatLogger(AggregateStatLoggerBase): scheduler_stats.spec_decoding_stats, engine_idx ) + if scheduler_stats.kv_connector_stats is not None: + self.kv_connector_prom.observe( + scheduler_stats.kv_connector_stats, engine_idx + ) + if mm_cache_stats is not None: self.counter_mm_cache_queries[engine_idx].inc(mm_cache_stats.queries) self.counter_mm_cache_hits[engine_idx].inc(mm_cache_stats.hits) @@ -1010,6 +1051,25 @@ class PrometheusStatLogger(AggregateStatLoggerBase): } self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time() + def record_sleep_state(self, sleep: int = 0, level: int = 0): + awake = 1 + discard_all = 0 + weights_offloaded = 0 + + if sleep == 1: + awake = 0 + if level == 1: + weights_offloaded = 1 + elif level == 2: + discard_all = 1 + + for engine_idx in self.engine_indexes: + self.gauge_engine_sleep_state["discard_all"][engine_idx].set(discard_all) + self.gauge_engine_sleep_state["weights_offloaded"][engine_idx].set( + weights_offloaded + ) + self.gauge_engine_sleep_state["awake"][engine_idx].set(awake) + def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) @@ -1131,6 +1191,10 @@ class StatLoggerManager: engine_idx=engine_idx, ) + def record_sleep_state(self, sleep: int = 0, level: int = 0): + for logger in self.stat_loggers: + logger.record_sleep_state(sleep, level) + def log(self): for logger in self.stat_loggers: logger.log() diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index b845852a0c..a319ffb1d2 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.spec_decode.metrics import SpecDecodingProm @@ -141,6 +142,18 @@ class RaySpecDecodingProm(SpecDecodingProm): _counter_cls = RayCounterWrapper +class RayKVConnectorPrometheus(KVConnectorPrometheus): + """ + RayKVConnectorPrometheus is used by RayMetrics to log Ray + metrics. Provides the same metrics as KV connectors but + uses Ray's util.metrics library. 
+ """ + + _gauge_cls = RayGaugeWrapper + _counter_cls = RayCounterWrapper + _histogram_cls = RayHistogramWrapper + + class RayPrometheusStatLogger(PrometheusStatLogger): """RayPrometheusStatLogger uses Ray metrics instead.""" @@ -148,6 +161,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _counter_cls = RayCounterWrapper _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm + _kv_connector_cls = RayKVConnectorPrometheus @staticmethod def _unregister_vllm_metrics(): diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 10f97576b6..e7122ba339 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -59,6 +59,15 @@ class LogprobsTensors(NamedTuple): cu_num_generated_tokens, ) + def to_cpu_nonblocking(self) -> "LogprobsTensors": + if self.logprob_token_ids.device.type == "cpu": + return self + return LogprobsTensors( + self.logprob_token_ids.to("cpu", non_blocking=True), + self.logprobs.to("cpu", non_blocking=True), + self.selected_token_ranks.to("cpu", non_blocking=True), + ) + @staticmethod def empty_cpu( num_positions: int, num_tokens_per_position: int diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 129d7e5446..729ce462cf 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,6 +8,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager from copy import deepcopy +from functools import reduce from itertools import product from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast @@ -164,6 +165,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): self, model_runner_output: ModelRunnerOutput, sampled_token_ids: torch.Tensor, + logprobs_tensors: torch.Tensor | None, invalid_req_indices: list[int], async_output_copy_stream: torch.cuda.Stream, ): @@ -176,6 +178,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): # Keep a reference to the device tensor to avoid it being # deallocated until we finish copying it to the host. self._sampled_token_ids = sampled_token_ids + self._logprobs_tensors = logprobs_tensors # Initiate the copy on a separate stream, but do not synchronize it. default_stream = torch.cuda.current_stream() @@ -184,6 +187,11 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): self.sampled_token_ids_cpu = self._sampled_token_ids.to( "cpu", non_blocking=True ) + self._logprobs_tensors_cpu = ( + self._logprobs_tensors.to_cpu_nonblocking() + if self._logprobs_tensors + else None + ) self.async_copy_ready_event.record() def get_output(self) -> ModelRunnerOutput: @@ -193,7 +201,8 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): """ self.async_copy_ready_event.synchronize() - # Release the device tensor once the copy has completed + # Release the device tensors once the copy has completed. + del self._logprobs_tensors del self._sampled_token_ids valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() @@ -202,6 +211,10 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids + if self._logprobs_tensors_cpu: + # NOTE(nick): this will need to be updated to use cu_num_accepted_tokens + # for async sched + spec decode + logprobs compatibility. 
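Note: `to_cpu_nonblocking` and the extra `logprobs_tensors` argument above extend the async-copy path: device tensors are copied to the host on a side stream and only synchronized when the output is consumed. A hedged sketch of that pattern follows; it assumes a CUDA device is available and the helper name is illustrative, not part of the patch.

```python
# Sketch of the copy-on-a-side-stream pattern used by AsyncGPUModelRunnerOutput.
import torch


def start_async_d2h(t: torch.Tensor, copy_stream: torch.cuda.Stream):
    ready = torch.cuda.Event()
    # Make the copy stream wait for work already enqueued on the default stream.
    copy_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(copy_stream):
        host_t = t.to("cpu", non_blocking=True)
        ready.record()  # recorded on copy_stream
    return host_t, ready


if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    dev = torch.randn(4, 8, device="cuda")
    host, ready = start_async_d2h(dev, stream)
    ready.synchronize()  # block only when the host copy is actually needed
    assert host.device.type == "cpu"
```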
+ output.logprobs = self._logprobs_tensors_cpu.tolists() return output @@ -694,7 +707,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] - resumed_from_preemption = req_data.resumed_from_preemption[i] + resumed_from_preemption = req_id in req_data.resumed_req_ids num_output_tokens = req_data.num_output_tokens[i] # Update the cached states. @@ -742,16 +755,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids - if self.use_async_scheduling and num_output_tokens > 0: - # We must recover the output token ids for resumed requests in the - # async scheduling case, so that correct input_ids are obtained. - resumed_token_ids = req_data.resumed_req_token_ids[i] - assert resumed_token_ids is not None - req_state.output_token_ids = resumed_token_ids[-num_output_tokens:] if req_index is None: # The request is not in the persistent batch. # The request was either preempted and resumed later, or was not # scheduled in the previous step and needs to be added again. + + if self.use_async_scheduling and num_output_tokens > 0: + # We must recover the output token ids for resumed requests in the + # async scheduling case, so that correct input_ids are obtained. + resumed_token_ids = req_data.all_token_ids[req_id] + req_state.output_token_ids = resumed_token_ids[-num_output_tokens:] + reqs_to_add.append(req_state) continue @@ -2334,11 +2348,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cu_num_accepted_tokens[-1] + len(sampled_ids) ) - # NOTE: GPU -> CPU Sync happens here. - # Move as many CPU operations as possible before this sync point. logprobs_lists = ( logprobs_tensors.tolists(cu_num_accepted_tokens) - if logprobs_tensors is not None + if not self.use_async_scheduling and logprobs_tensors is not None else None ) @@ -2664,6 +2676,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): async_output = AsyncGPUModelRunnerOutput( model_runner_output=output, sampled_token_ids=sampler_output.sampled_token_ids, + logprobs_tensors=sampler_output.logprobs_tensors, invalid_req_indices=invalid_req_indices, async_output_copy_stream=self.async_output_copy_stream, ) @@ -4122,26 +4135,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def calculate_reorder_batch_threshold(self) -> None: """ - Check that if any backends reorder batches; that the reordering - is compatible (e.g., decode threshold is the same) + Choose the minimum reorder batch threshold from all attention groups. + Backends should be able to support lower threshold then what they request + just may have a performance penalty due to that backend treating decodes + as prefills. 
""" - for group in self._attn_group_iterator(): - attn_metadata_builder_i = group.get_metadata_builder() + min_none_high = lambda a, b: a if b is None else b if a is None else min(a, b) - # check that if any backends reorder batches; that the reordering - # is compatible (e.g., decode threshold is the same) - reorder_batch_threshold_i = attn_metadata_builder_i.reorder_batch_threshold - if reorder_batch_threshold_i is not None: - if self.reorder_batch_threshold is not None: - if reorder_batch_threshold_i != self.reorder_batch_threshold: - raise ValueError( - f"Attention backend reorders decodes with " - f"threshold {reorder_batch_threshold_i} but other " - f"backend uses threshold " - f"{self.reorder_batch_threshold}" - ) - else: - self.reorder_batch_threshold = reorder_batch_threshold_i + reorder_batch_thresholds = [ + group.get_metadata_builder().reorder_batch_threshold + for group in self._attn_group_iterator() + ] + self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) def _find_compatible_block_sizes( self, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5d7b181989..0ced138b94 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -483,7 +483,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] - resumed_from_preemption = req_data.resumed_from_preemption[i] + resumed_from_preemption = req_id in req_data.resumed_req_ids # Update the cached states. req_state.num_computed_tokens = num_computed_tokens diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index 6edcb78486..9f16b1e6d0 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -185,6 +185,15 @@ def dbo_register_recv_hook(recv_hook): next_ctx.recv_hook = recv_hook +def dbo_get_previous_event(func, *args, **kwargs): + if len(_THREAD_ID_TO_CONTEXT) > 0: + ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()] + ctx = _CURRENT_CONTEXTS[ctx_idx] + # execute callable on the ubatch compute stream to record/wait events there + with torch.cuda.stream(ctx.compute_stream): + return func(*args, **kwargs) + + def make_ubatch_contexts( num_micro_batches: int, compute_stream: torch.cuda.Stream,