Compare commits

..

4 Commits

SHA1        Message                              Date
22bf5c5077  fix                                  2025-10-11 11:38:33 -07:00
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
3a8990743e  add truncation                       2025-10-11 11:20:31 -07:00
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
fbc2cc8217  merge                                2025-10-11 11:09:22 -07:00
efd4bc967d  [Misc] Remove in ModelRunnerOutput   2025-08-23 21:09:20 -07:00
            Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
1150 changed files with 14829 additions and 23579 deletions


@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 def print_top_10_largest_files(zip_file):
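For orientation, a minimal sketch of the kind of guard this limit feeds (the wheel path below is a placeholder; the real logic lives in the script being diffed):

import os
import sys

# Hypothetical standalone check mirroring the limit above.
limit_mb = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
wheel_path = "dist/vllm-0.0.0-cp38-abi3-manylinux1_x86_64.whl"  # placeholder name
wheel_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_mb > limit_mb:
    print(f"Wheel {wheel_path} is {wheel_mb:.1f} MiB, exceeding the {limit_mb} MiB limit")
    sys.exit(1)
print(f"Wheel size {wheel_mb:.1f} MiB is within the {limit_mb} MiB limit")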


@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5


@@ -1,11 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
limit: 100
num_fewshot: 0


@@ -1,11 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5


@@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"


@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0


@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml


@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml


@@ -1 +0,0 @@
Qwen2.5-VL-7B-Instruct.yaml


@@ -1,44 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT


@@ -1,50 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto


@@ -19,27 +19,21 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
-        model=backend,
+        model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
-        batch_size=batch_size,
+        batch_size="auto",
     )
     return results
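For reference, the YAML baselines earlier in this compare are consumed by a check of roughly this shape (a sketch, not this test file's exact code; it assumes lm-eval's standard results-dict layout and the RTOL = 0.08 visible in the hunk header above):

import numpy as np

RTOL = 0.08  # from the file header shown in the hunk above

def check_results(eval_config: dict, results: dict) -> None:
    # Compare each expected metric value from the YAML config against the measured value.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            assert np.isclose(ground_truth, measured, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: expected {ground_truth}, got {measured}"
            )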


@@ -8,7 +8,7 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # Add job to create multi-arch manifest

File diff suppressed because it is too large.

@@ -527,8 +527,7 @@ steps:
     # since torchao nightly is only compatible with torch nightly currently
     # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
     # we can only upgrade after this is resolved
-    # TODO(jerryzh168): resolve the above comment
-    - uv pip install --system torchao==0.13.0
+    - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 
 - label: LM Eval Small Models # 53min
@@ -734,16 +733,6 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true


@@ -1,10 +1,5 @@
 [run]
-# Track the installed vllm package (this is what actually gets imported during tests)
-# Use wildcard pattern to match the installed location
-source =
-    vllm
-    */dist-packages/vllm
-    */site-packages/vllm
+source = vllm
 omit =
     */tests/*
     */test_*
@@ -17,16 +12,6 @@ omit =
     */benchmarks/*
     */docs/*
 
-[paths]
-# Map all possible vllm locations to a canonical "vllm" path
-# This ensures coverage.combine properly merges data from different test runs
-source =
-    vllm
-    /vllm-workspace/src/vllm
-    /vllm-workspace/vllm
-    */site-packages/vllm
-    */dist-packages/vllm
-
 [report]
 exclude_lines =
     pragma: no cover
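As background on the removed [paths] block, a small sketch of the combine step it was added for (using coverage.py's Python API; the data-file locations are assumptions, not taken from this CI setup):

import coverage

# Merge .coverage.* data files produced by separate test runs. Without a [paths]
# mapping, hits recorded under */site-packages/vllm/... and under a source checkout
# count as different files, so the combined report under-reports coverage.
cov = coverage.Coverage(config_file=".coveragerc")
cov.combine()   # looks for .coverage.* data files next to the config
cov.save()
cov.report()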


@@ -1,4 +0,0 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

.github/CODEOWNERS (vendored, 11 lines changed)

@@ -5,7 +5,9 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
@@ -24,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -118,11 +121,3 @@ mkdocs.yaml @hmellor
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
 
-# Pooling models
-/examples/*/pooling/ @noooop
-/tests/models/*/pooling* @noooop
-/tests/entrypoints/pooling @noooop
-/vllm/config/pooler.py @noooop
-/vllm/pooling_params.py @noooop
-/vllm/model_executor/layers/pooler.py @noooop


@ -13,7 +13,6 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Label issues based on keywords - name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with: with:
script: | script: |
@ -43,6 +42,7 @@ jobs:
searchIn: "body" searchIn: "body"
}, },
], ],
// Substring search - matches anywhere in text (partial matches) // Substring search - matches anywhere in text (partial matches)
substrings: [ substrings: [
{ {
@ -89,12 +89,14 @@ jobs:
term: "hip_", term: "hip_",
searchIn: "both" searchIn: "both"
}, },
// ROCm tools and libraries // ROCm tools and libraries
{ {
term: "hipify", term: "hipify",
searchIn: "both" searchIn: "both"
}, },
], ],
// Regex patterns - for complex pattern matching // Regex patterns - for complex pattern matching
regexPatterns: [ regexPatterns: [
{ {
@ -105,17 +107,13 @@ jobs:
} }
], ],
}, },
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
}; };
// Helper function to create regex based on search type // Helper function to create regex based on search type
function createSearchRegex(term, type) { function createSearchRegex(term, type) {
// Escape special regex characters in the term // Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) { switch (type) {
case 'keyword': case 'keyword':
// Word boundary search - matches whole words only // Word boundary search - matches whole words only
@ -127,13 +125,16 @@ jobs:
throw new Error(`Unknown search type: ${type}`); throw new Error(`Unknown search type: ${type}`);
} }
} }
// Helper function to find matching terms in text with line information // Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = []; const matches = [];
const lines = text.split('\n'); const lines = text.split('\n');
for (const termConfig of searchTerms) { for (const termConfig of searchTerms) {
let regex; let regex;
let term, searchIn, pattern, description, flags; let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object) // Handle different input formats (string or object)
if (typeof termConfig === 'string') { if (typeof termConfig === 'string') {
term = termConfig; term = termConfig;
@ -145,17 +146,21 @@ jobs:
description = termConfig.description; description = termConfig.description;
flags = termConfig.flags; flags = termConfig.flags;
} }
// Skip if this term shouldn't be searched in the current location // Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) { if (searchIn !== 'both' && searchIn !== searchLocation) {
continue; continue;
} }
// Create appropriate regex // Create appropriate regex
if (searchType === 'regex') { if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi"); regex = new RegExp(pattern, flags || "gi");
} else { } else {
regex = createSearchRegex(term, searchType); regex = createSearchRegex(term, searchType);
} }
const termMatches = []; const termMatches = [];
// Check each line for matches // Check each line for matches
lines.forEach((line, lineIndex) => { lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex); const lineMatches = line.match(regex);
@ -170,14 +175,15 @@ jobs:
originalTerm: term || pattern, originalTerm: term || pattern,
description: description, description: description,
// Show context around the match in the line // Show context around the match in the line
context: line.length > 100 ? context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim() : line.trim()
}); });
}); });
} }
}); });
if (termMatches.length > 0) { if (termMatches.length > 0) {
matches.push({ matches.push({
term: term || (description || pattern), term: term || (description || pattern),
@ -190,48 +196,64 @@ jobs:
}); });
} }
} }
return matches; return matches;
} }
// Helper function to check if label should be added // Helper function to check if label should be added
async function processLabel(labelName, config) { async function processLabel(labelName, config) {
const body = context.payload.issue.body || ""; const body = context.payload.issue.body || "";
const title = context.payload.issue.title || ""; const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`); core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`); core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`); core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false; let shouldAddLabel = false;
let allMatches = []; let allMatches = [];
let reason = ''; let reason = '';
const keywords = config.keywords || []; const keywords = config.keywords || [];
const substrings = config.substrings || []; const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || []; const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title // Search in title
if (title.trim()) { if (title.trim()) {
core.notice(`Searching in title: "${title}"`); core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
} }
// Search in body // Search in body
if (body.trim()) { if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`); core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
} }
if (allMatches.length > 0) { if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`); core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) { for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') { if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else { } else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} }
// Show details for each match // Show details for each match
termMatch.matches.forEach((match, index) => { termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -244,6 +266,7 @@ jobs:
} }
}); });
} }
shouldAddLabel = true; shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -251,10 +274,13 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
} }
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`); core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) { if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name); const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) { if (!existingLabels.includes(labelName)) {
@ -270,92 +296,14 @@ jobs:
core.notice(`Label "${labelName}" already present.`); core.notice(`Label "${labelName}" already present.`);
return false; return false;
} }
core.notice(`No matching terms found for label "${labelName}".`); core.notice(`No matching terms found for label "${labelName}".`);
return false; return false;
} }
// Process all configured labels // Process all configured labels
const labelsAddedResults = await Promise.all( const processLabels = Object.entries(labelConfig)
Object.entries(labelConfig).map(([labelName, config]) => .map(([labelName, config]) => processLabel(labelName, config));
processLabel(labelName, config).then(added => ({ labelName, added })) const labelsAdded = await Promise.all(processLabels);
) const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
); core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}
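The keyword / substring / regex distinction in the workflow above comes down to how the search pattern is built; a rough Python equivalent of the `createSearchRegex` helper (illustrative only, the workflow itself is JavaScript):

import re

def create_search_regex(term: str, search_type: str) -> re.Pattern:
    escaped = re.escape(term)  # escape special regex characters, as the JS helper does
    if search_type == "keyword":
        # Word-boundary search: matches whole words only.
        return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
    if search_type == "substring":
        # Substring search: matches anywhere in the text.
        return re.compile(escaped, re.IGNORECASE)
    raise ValueError(f"Unknown search type: {search_type}")

# Example: "rocm" as a substring matches inside "rocmsoftwareplatform",
# but as a keyword (whole word) it does not.
print(bool(create_search_regex("rocm", "substring").search("see rocmsoftwareplatform docs")))  # True
print(bool(create_search_regex("rocm", "keyword").search("see rocmsoftwareplatform docs")))    # False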


@@ -16,7 +16,6 @@ repos:
   rev: v1.38.1
   hooks:
   - id: typos
-    args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v21.1.2
   hooks:


@ -8,6 +8,7 @@ import sys
import time import time
import traceback import traceback
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp import aiohttp
import huggingface_hub.constants import huggingface_hub.constants
@ -27,13 +28,13 @@ class RequestFuncInput:
prompt_len: int prompt_len: int
output_len: int output_len: int
model: str model: str
model_name: str | None = None model_name: Optional[str] = None
logprobs: int | None = None logprobs: Optional[int] = None
extra_body: dict | None = None extra_body: Optional[dict] = None
multi_modal_content: dict | list[dict] | None = None multi_modal_content: Optional[dict | list[dict]] = None
ignore_eos: bool = False ignore_eos: bool = False
language: str | None = None language: Optional[str] = None
request_id: str | None = None request_id: Optional[str] = None
@dataclass @dataclass
@ -51,7 +52,7 @@ class RequestFuncOutput:
async def async_request_tgi( async def async_request_tgi(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
@ -132,7 +133,7 @@ async def async_request_tgi(
async def async_request_trt_llm( async def async_request_trt_llm(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
@ -203,7 +204,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii( async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(("completions", "profile")), (
@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions( async def async_request_openai_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), ( assert api_url.endswith(("completions", "profile")), (
@ -366,7 +367,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions( async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), ( assert api_url.endswith(("chat/completions", "profile")), (
@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio( async def async_request_openai_audio(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: tqdm | None = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep. # Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile import soundfile
@ -609,7 +610,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
**kwargs, **kwargs,
) -> PreTrainedTokenizer | PreTrainedTokenizerFast: ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists( if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path pretrained_model_name_or_path
): ):
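The typing changes in this and the following benchmark files all follow the same pattern; as a reminder (illustrative snippet, not taken from the diff), the two spellings are interchangeable for type checkers, and the `X | Y` form also works at runtime on Python 3.10+ (or in annotations on older versions when `from __future__ import annotations` is in effect, which is exactly what some later hunks add):

from typing import Optional, Union

# Old-style typing aliases.
def g(name: Optional[str] = None) -> Union[int, float]:
    return 0

# Equivalent PEP 604 union syntax.
def f(name: str | None = None) -> int | float:
    return 0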


@ -32,6 +32,7 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import Optional
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int], input_length_range: tuple[int, int],
fixed_output_len: int | None, fixed_output_len: Optional[int],
) -> list[Request]: ) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")
@ -127,7 +128,7 @@ def sample_requests_from_random(
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int], input_length_range: tuple[int, int],
fixed_output_len: int | None, fixed_output_len: Optional[int],
prefix_len: int, prefix_len: int,
) -> list[Request]: ) -> list[Request]:
requests = [] requests = []


@ -7,6 +7,7 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -23,7 +24,7 @@ def sample_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
fixed_output_len: int | None, fixed_output_len: Optional[int],
) -> list[tuple[str, int, int, int]]: ) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")


@ -32,6 +32,7 @@ import uuid
import warnings import warnings
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional
import datasets import datasets
import numpy as np import numpy as np
@ -315,7 +316,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str], selected_percentile_metrics: list[str],
selected_percentiles: list[float], selected_percentiles: list[float],
goodput_config_dict: dict[str, float] | None = None, goodput_config_dict: Optional[dict[str, float]] = None,
) -> tuple[BenchmarkMetrics, list[int]]: ) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = [] actual_output_lens: list[int] = []
total_input = 0 total_input = 0
@ -435,9 +436,9 @@ async def benchmark(
selected_percentile_metrics: list[str], selected_percentile_metrics: list[str],
selected_percentiles: list[str], selected_percentiles: list[str],
ignore_eos: bool, ignore_eos: bool,
max_concurrency: int | None, max_concurrency: Optional[int],
structured_output_ratio: float, structured_output_ratio: float,
goodput_config_dict: dict[str, float] | None = None, goodput_config_dict: Optional[dict[str, float]] = None,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]


@ -6,7 +6,7 @@ import math
import os import os
import time import time
from types import TracebackType from types import TracebackType
from typing import Any from typing import Any, Optional, Union
def convert_to_pytorch_benchmark_format( def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None: def __init__(self, scale: int) -> None:
self.cnt: int = 0 self.cnt: int = 0
self._sum: int = 0 self._sum: int = 0
self._max: int | None = None self._max: Optional[int] = None
self.scale = scale self.scale = scale
self.start_time: int = time.monotonic_ns() self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
else: else:
self._max = max(self._max, v) self._max = max(self._max, v)
def avg(self) -> float | str: def avg(self) -> Union[float, str]:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> float | str: def max(self) -> Union[float, str]:
return self._max / self.scale if self._max else "N/A" return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[float | str]: def dump_avg_max(self) -> list[Union[float, str]]:
return [self.avg(), self.max()] return [self.avg(), self.max()]
def __enter__(self) -> None: def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
def __exit__( def __exit__(
self, self,
exc_type: type[BaseException] | None, exc_type: Optional[type[BaseException]],
exc_value: BaseException | None, exc_value: Optional[BaseException],
exc_traceback: TracebackType | None, exc_traceback: Optional[TracebackType],
) -> None: ) -> None:
self.collect(time.monotonic_ns() - self.start_time) self.collect(time.monotonic_ns() - self.start_time)


@ -6,7 +6,8 @@ import copy
import itertools import itertools
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Callable
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark


@ -6,7 +6,8 @@ import copy
import itertools import itertools
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -52,7 +53,7 @@ def bench_int8(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels.""" """Benchmark INT8-based kernels."""
assert dtype == torch.int8 assert dtype == torch.int8
@ -107,7 +108,7 @@ def bench_fp8(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels.""" """Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn assert dtype == torch.float8_e4m3fn
@ -182,7 +183,7 @@ def bench(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
if dtype == torch.int8: if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -200,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run( def run(
dtype: torch.dtype, dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]], MKNs: Iterable[tuple[int, int, int]],
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
results = [] results = []
for m, k, n in MKNs: for m, k, n in MKNs:


@ -3,9 +3,10 @@
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from itertools import product from itertools import product
from typing import Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl( def unfused_int8_impl(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
# Norm # Norm
@ -67,7 +68,7 @@ def unfused_int8_impl(
def unfused_fp8_impl( def unfused_fp8_impl(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
# Norm # Norm
@ -84,7 +85,7 @@ def unfused_fp8_impl(
def fused_impl( def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
out, _ = ops.rms_norm_dynamic_per_token_quant( out, _ = ops.rms_norm_dynamic_per_token_quant(


@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from collections.abc import Callable from typing import Callable
from unittest.mock import patch from unittest.mock import patch
import pandas as pd import pandas as pd


@ -22,8 +22,8 @@ Example:
import json import json
import os import os
import time import time
from collections.abc import Callable
from contextlib import nullcontext from contextlib import nullcontext
from typing import Callable, Optional
import torch import torch
import torch.distributed as dist import torch.distributed as dist
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
def benchmark_allreduce_single( def benchmark_allreduce_single(
self, self,
sequence_length: int, sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
should_use_fn: Callable[[torch.Tensor], bool], should_use_fn: Callable[[torch.Tensor], bool],
context, context,
num_warmup: int, num_warmup: int,
num_trials: int, num_trials: int,
) -> float | None: ) -> Optional[float]:
"""Benchmark method with CUDA graph optimization.""" """Benchmark method with CUDA graph optimization."""
try: try:
# Create test tensor (2D: sequence_length x hidden_size) # Create test tensor (2D: sequence_length x hidden_size)


@ -6,12 +6,11 @@ import copy
import json import json
import pickle import pickle
import time import time
from collections.abc import Callable
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum, auto from enum import Enum, auto
from itertools import product from itertools import product
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -159,7 +158,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor, seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor,
scaling: float, scaling: float,
add_inputs: bool | None, add_inputs: Optional[bool],
): ):
""" """
Torch group gemm reference implementation to test correctness of Torch group gemm reference implementation to test correctness of
@ -317,8 +316,8 @@ class BenchmarkContext:
lora_rank: int lora_rank: int
sort_by_lora_id: bool sort_by_lora_id: bool
dtype: torch.dtype dtype: torch.dtype
seq_length: int | None = None seq_length: Optional[int] = None
num_slices: int | None = None # num_slices for slice based ops num_slices: Optional[int] = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext": def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self) ctx = copy.copy(self)
@ -562,7 +561,7 @@ class BenchmarkTensors:
} }
def bench_fn_kwargs( def bench_fn_kwargs(
self, op_type: OpType, add_inputs: bool | None = None self, op_type: OpType, add_inputs: Optional[bool] = None
) -> dict[str, Any]: ) -> dict[str, Any]:
if op_type.is_shrink_fn(): if op_type.is_shrink_fn():
assert add_inputs is None assert add_inputs is None
@ -576,7 +575,7 @@ class BenchmarkTensors:
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
def test_correctness( def test_correctness(
self, op_type: OpType, expand_fn_add_inputs: bool | None self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
) -> bool: ) -> bool:
""" """
Test correctness of op_type implementation against a grouped gemm Test correctness of op_type implementation against a grouped gemm
@ -612,8 +611,8 @@ def bench_optype(
ctx: BenchmarkContext, ctx: BenchmarkContext,
arg_pool_size: int, arg_pool_size: int,
op_type: OpType, op_type: OpType,
cuda_graph_nops: int | None = None, cuda_graph_nops: Optional[int] = None,
expand_fn_add_inputs: bool | None = None, expand_fn_add_inputs: Optional[bool] = None,
test_correctness: bool = False, test_correctness: bool = False,
) -> TMeasurement: ) -> TMeasurement:
assert arg_pool_size >= 1 assert arg_pool_size >= 1
@ -680,7 +679,7 @@ def bench_torch_mm(
ctx: BenchmarkContext, ctx: BenchmarkContext,
arg_pool_size: int, arg_pool_size: int,
op_type: OpType, op_type: OpType,
cuda_graph_nops: int | None = None, cuda_graph_nops: Optional[int] = None,
) -> TMeasurement: ) -> TMeasurement:
""" """
Benchmark basic torch.mm as a roofline. Benchmark basic torch.mm as a roofline.
@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
""" """
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
compare = TBenchmark.Compare(timers) compare = TBenchmark.Compare(timers)
compare.print() compare.print()


@ -8,9 +8,10 @@ import math
import os import os
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from itertools import product from itertools import product
from typing import Callable, Optional
import pandas as pd import pandas as pd
import torch import torch
@ -62,23 +63,23 @@ class BenchmarkTensors:
a: torch.Tensor a: torch.Tensor
w_q: torch.Tensor w_q: torch.Tensor
group_size: int | None group_size: Optional[int]
wtype: ScalarType wtype: ScalarType
w_g_s: torch.Tensor w_g_s: torch.Tensor
w_g_zp: torch.Tensor | None w_g_zp: Optional[torch.Tensor]
w_ch_s: torch.Tensor | None w_ch_s: Optional[torch.Tensor]
w_tok_s: torch.Tensor | None w_tok_s: Optional[torch.Tensor]
@dataclass @dataclass
class TypeConfig: class TypeConfig:
act_type: torch.dtype act_type: torch.dtype
weight_type: ScalarType weight_type: ScalarType
output_type: torch.dtype | None output_type: Optional[torch.dtype]
group_scale_type: torch.dtype | None group_scale_type: Optional[torch.dtype]
group_zero_type: torch.dtype | None group_zero_type: Optional[torch.dtype]
channel_scale_type: torch.dtype | None channel_scale_type: Optional[torch.dtype]
token_scale_type: torch.dtype | None token_scale_type: Optional[torch.dtype]
def rand_data(shape, dtype=torch.float16, scale=1): def rand_data(shape, dtype=torch.float16, scale=1):
@ -92,8 +93,8 @@ def quantize_and_pack(
atype: torch.dtype, atype: torch.dtype,
w: torch.Tensor, w: torch.Tensor,
wtype: ScalarType, wtype: ScalarType,
stype: torch.dtype | None, stype: Optional[torch.dtype],
group_size: int | None, group_size: Optional[int],
zero_points: bool = False, zero_points: bool = False,
): ):
assert wtype.is_integer(), "TODO: support floating point weights" assert wtype.is_integer(), "TODO: support floating point weights"
@ -112,7 +113,7 @@ def quantize_and_pack(
def create_bench_tensors( def create_bench_tensors(
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
) -> list[BenchmarkTensors]: ) -> list[BenchmarkTensors]:
m, n, k = shape m, n, k = shape
@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res return res
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
def bench( def bench(


@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
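For context (illustrative, not part of the diff): the two spellings here track the Transformers rename of the config attribute `torch_dtype` to `dtype`. A minimal snippet that reads whichever one the installed Transformers version exposes (the model id is arbitrary):

import torch
from transformers import AutoConfig

# Arbitrary public model id, used only for illustration.
config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
# Newer Transformers expose the checkpoint dtype as `dtype`, older ones as `torch_dtype`;
# either may be None, in which case we fall back to float16 here.
dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None) or torch.float16
print(dtype)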


@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute


@ -3,6 +3,7 @@
import random import random
import time import time
from typing import Optional
import torch import torch
@ -36,7 +37,7 @@ def main(
seed: int, seed: int,
do_profile: bool, do_profile: bool,
device: str = "cuda", device: str = "cuda",
kv_cache_dtype: str | None = None, kv_cache_dtype: Optional[str] = None,
) -> None: ) -> None:
current_platform.seed_everything(seed) current_platform.seed_everything(seed)


@ -3,8 +3,8 @@
import argparse import argparse
import math import math
from collections.abc import Callable
from contextlib import contextmanager from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch from unittest.mock import patch
import torch import torch


@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random import random
import time import time


@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random import random
import time import time


@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union
import torch import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -20,8 +21,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward( def forward(
self, self,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype orig_dtype = x.dtype
x = x.to(torch.float32) x = x.to(torch.float32)
if residual is not None: if residual is not None:
@ -40,7 +41,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive( def rmsnorm_naive(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -64,7 +65,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer( def rmsnorm_flashinfer(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape
@ -88,7 +89,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm( def rmsnorm_vllm(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional
import nvtx import nvtx
import torch import torch
@ -17,7 +18,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int, seq_len: int,
num_heads: int, num_heads: int,
head_size: int, head_size: int,
rotary_dim: int | None, rotary_dim: Optional[int],
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: str, device: str,

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_decode( def benchmark_decode(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_prefill( def benchmark_prefill(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__( def __init__(
self, self,
cuda_graph_params: CudaGraphBenchParams | None, cuda_graph_params: Optional[CudaGraphBenchParams],
label: str, label: str,
sub_label: str, sub_label: str,
description: str, description: str,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from statistics import mean from statistics import mean
from typing import Any, NamedTuple from typing import Any, NamedTuple, Optional, Union
import numpy as np # type: ignore import numpy as np # type: ignore
import pandas as pd # type: ignore import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution): class UniformDistribution(Distribution):
def __init__( def __init__(
self, self,
min_val: int | float, min_val: Union[int, float],
max_val: int | float, max_val: Union[int, float],
is_integer: bool = True, is_integer: bool = True,
) -> None: ) -> None:
self.min_val = min_val self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution): class ConstantDistribution(Distribution):
def __init__(self, value: int | float) -> None: def __init__(self, value: Union[int, float]) -> None:
self.value = value self.value = value
self.max_val = value self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution): class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution): class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution): class LognormalDistribution(Distribution):
def __init__( def __init__(
self, self,
mean: float | None = None, mean: Optional[float] = None,
sigma: float | None = None, sigma: Optional[float] = None,
average: int | None = None, average: Optional[int] = None,
median_ratio: float | None = None, median_ratio: Optional[float] = None,
max_val: int | None = None, max_val: Optional[int] = None,
) -> None: ) -> None:
self.average = average self.average = average
self.median_ratio = median_ratio self.median_ratio = median_ratio

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from statistics import mean from statistics import mean
from typing import NamedTuple from typing import NamedTuple, Union
import aiohttp # type: ignore import aiohttp # type: ignore
import numpy as np # type: ignore import numpy as np # type: ignore
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats: class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None: def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger self.logger = logger
self.metrics: dict[str, MovingAverage | MetricStats] = { self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
"moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(), "ttft_ms": MetricStats(),
@ -636,7 +636,7 @@ async def client_main(
if args.verbose: if args.verbose:
curr_time_sec: float = time.perf_counter() curr_time_sec: float = time.perf_counter()
time_since_last_turn: str | float = "N/A" time_since_last_turn: Union[str, float] = "N/A"
if conv_id in time_of_last_turn: if conv_id in time_of_last_turn:
time_since_last_turn = round( time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3 curr_time_sec - time_of_last_turn[conv_id], 3
@ -928,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
) )
rps: str | float = round(len(client_metrics) / runtime_sec, 3) rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients): if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low # Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed) # (threshold can be tuned if needed)
rps = "N/A" rps = "N/A"
runtime_left_sec: str | float = round( runtime_left_sec: Union[str, float] = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
) )
if percent < 0.05: if percent < 0.05:

View File

@ -13,7 +13,7 @@ import argparse
import json import json
import random import random
from statistics import mean from statistics import mean
from typing import Any from typing import Any, Optional
import pandas as pd # type: ignore import pandas as pd # type: ignore
import tqdm # type: ignore import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid( def content_is_valid(
content: str, min_content_len: int | None, max_content_len: int | None content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool: ) -> bool:
if min_content_len and len(content) < min_content_len: if min_content_len and len(content) < min_content_len:
return False return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats( def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None: ) -> None:
# Collect statistics # Collect statistics
stats = [] stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int, seed: int,
input_file: str, input_file: str,
output_file: str, output_file: str,
max_items: int | None, max_items: Optional[int],
min_content_len: int | None = None, min_content_len: Optional[int] = None,
max_content_len: int | None = None, max_content_len: Optional[int] = None,
min_turns: int | None = None, min_turns: Optional[int] = None,
max_turns: int | None = None, max_turns: Optional[int] = None,
model: str | None = None, model: Optional[str] = None,
) -> None: ) -> None:
if min_turns and max_turns: if min_turns and max_turns:
assert min_turns <= max_turns assert min_turns <= max_turns

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
) )
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif() endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR) if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif() endif()

View File

@ -1,12 +0,0 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
from typing import Union
from cutlass_library import * from cutlass_library import *
@ -21,7 +22,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto() TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeNames, # type: ignore **DataTypeNames, # type: ignore
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
@ -29,7 +30,7 @@ VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeTag, # type: ignore **DataTypeTag, # type: ignore
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@ -37,7 +38,7 @@ VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**DataTypeSize, # type: ignore **DataTypeSize, # type: ignore
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
@ -45,7 +46,7 @@ VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
}, },
} }
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128", VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4", DataType.u4: "vllm::kU4",
@ -56,7 +57,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.bf16: "vllm::kBfloat16", DataType.bf16: "vllm::kBfloat16",
} }
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.u8: "at::ScalarType::Byte", DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char", DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn", DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -66,7 +67,9 @@ VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
**KernelScheduleTag, # type: ignore **KernelScheduleTag, # type: ignore
**{ **{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

View File

@ -2,7 +2,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -19,22 +18,11 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) { const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
constexpr int VEC_SIZE = 8; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) { const float x = (float)input[blockIdx.x * input_stride + idx];
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;

View File

@ -10,7 +10,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -29,22 +28,10 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;

View File

@ -21,6 +21,7 @@
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../cub_helpers.h" #include "../cub_helpers.h"
#include "../core/batch_invariant.hpp"
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int VPT = Constants::VPT; static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);

View File

@ -9,6 +9,7 @@ from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass, fields from dataclasses import dataclass, fields
from functools import reduce from functools import reduce
from typing import Optional, Union
import jinja2 import jinja2
from vllm_cutlass_library_extension import ( from vllm_cutlass_library_extension import (
@ -258,7 +259,7 @@ class ScheduleConfig:
@dataclass(frozen=True) @dataclass(frozen=True)
class TypeConfig: class TypeConfig:
a: DataType a: DataType
b: DataType | VLLMDataType b: Union[DataType, VLLMDataType]
b_group_scale: DataType b_group_scale: DataType
b_group_zeropoint: DataType b_group_zeropoint: DataType
b_channel_scale: DataType b_channel_scale: DataType
@ -279,7 +280,7 @@ class PrepackTypeConfig:
class ImplConfig: class ImplConfig:
types: TypeConfig types: TypeConfig
schedules: list[ScheduleConfig] schedules: list[ScheduleConfig]
heuristic: list[tuple[str | None, ScheduleConfig]] heuristic: list[tuple[Optional[str], ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str: def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

View File

@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void __global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks, allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
int rank, uint8_t** dbuffer_list, int rank, uint8_t** dbuffer_list,
uint32_t data_offset, uint32_t flag_color, uint32_t data_offset, uint32_t flag_color) {
int64_t data_size_per_phase) {
int block = blockIdx.x; int block = blockIdx.x;
int grid = gridDim.x; int grid = gridDim.x;
while (block < num_blocks) { while (block < num_blocks) {
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset, AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
flag_color, data_size_per_phase); flag_color);
block += grid; block += grid;
flag_color++; flag_color++;
} }
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 4) { \ } else if (world_size == 4) { \
using LineCodec = __codec<T, 4>; \ using LineCodec = __codec<T, 4>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 8) { \ } else if (world_size == 8) { \
using LineCodec = __codec<T, 8>; \ using LineCodec = __codec<T, 8>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} }
enum QuickReduceQuantLevel { enum QuickReduceQuantLevel {

View File

@ -553,12 +553,13 @@ struct AllReduceTwoshot {
int const rank, // rank index int const rank, // rank index
uint8_t** __restrict__ buffer_list, // communication buffers uint8_t** __restrict__ buffer_list, // communication buffers
uint32_t const data_offset, // offset to start of the data buffer uint32_t const data_offset, // offset to start of the data buffer
uint32_t flag_color, int64_t data_size_per_phase) { uint32_t flag_color) {
// Topology // Topology
int thread = threadIdx.x + threadIdx.y * kWavefront; int thread = threadIdx.x + threadIdx.y * kWavefront;
uint8_t* rank_buffer = buffer_list[rank]; uint8_t* rank_buffer = buffer_list[rank];
Codec codec(thread, rank); Codec codec(thread, rank);
int block_id = blockIdx.x; int block_id = blockIdx.x;
int grid_size = gridDim.x;
// -------------------------------------------------------- // --------------------------------------------------------
// Read input into registers // Read input into registers
int32x4_t tA[kAtoms]; int32x4_t tA[kAtoms];
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
// rank responsible for this segment. // rank responsible for this segment.
uint32_t comm_data0_offset = uint32_t comm_data0_offset =
data_offset + block_id * Codec::kTransmittedTileSize; data_offset + block_id * Codec::kTransmittedTileSize;
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset; uint32_t comm_data1_offset =
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t)); uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset; uint32_t comm_flags1_offset =
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
for (int r = 0; r < kWorldSize; r++) { for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer = int32x4_t* send_buffer =

View File

@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=500 ARG VLLM_MAX_SIZE_MB=450
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \

View File

@ -11,7 +11,8 @@ The following code splits the model across 2 GPUs.
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
tensor_parallel_size=2)
``` ```
!!! warning !!! warning
@ -42,7 +43,9 @@ and the maximum batch size (`max_num_seqs` option).
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)
``` ```
## Reduce CUDA Graphs ## Reduce CUDA Graphs
@ -58,12 +61,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python ```python
from vllm import LLM from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode from vllm.config import CompilationConfig, CompilationLevel
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16], cudagraph_capture_sizes=[1, 2, 4, 8, 16],
), ),
@ -75,7 +78,8 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
enforce_eager=True)
``` ```
## Adjust cache size ## Adjust cache size
@ -93,10 +97,8 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM from vllm import LLM
# Accept up to 3 images and 1 video per prompt # Accept up to 3 images and 1 video per prompt
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 3, "video": 1})
limit_mm_per_prompt={"image": 3, "video": 1},
)
``` ```
You can go a step further and disable unused modalities completely by setting its limit to zero. You can go a step further and disable unused modalities completely by setting its limit to zero.
@ -106,10 +108,8 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM from vllm import LLM
# Accept any number of images but no videos # Accept any number of images but no videos
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0})
limit_mm_per_prompt={"video": 0},
)
``` ```
You can even run a multi-modal model for text-only inference: You can even run a multi-modal model for text-only inference:
@ -118,10 +118,8 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM from vllm import LLM
# Don't accept images. Just text. # Don't accept images. Just text.
llm = LLM( llm = LLM(model="google/gemma-3-27b-it",
model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0})
limit_mm_per_prompt={"image": 0},
)
``` ```
### Configurable options ### Configurable options
@ -175,14 +173,14 @@ Here are some examples:
from vllm import LLM from vllm import LLM
# Available for Qwen2-VL series models # Available for Qwen2-VL series models
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_kwargs={
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 "max_pixels": 768 * 768, # Default is 1280 * 28 * 28
) })
# Available for InternVL series models # Available for InternVL series models
llm = LLM( llm = LLM(model="OpenGVLab/InternVL2-2B",
model="OpenGVLab/InternVL2-2B", mm_processor_kwargs={
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 "max_dynamic_patch": 4, # Default is 12
) })
``` ```

View File

@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.3-70B-Instruct, model="meta-llama/Llama-3.3-70B-Instruct,
tensor_parallel_size=4, tensor_parallel_size=4,
pipeline_parallel_size=2, pipeline_parallel_size=2
) )
``` ```
@ -257,24 +257,18 @@ Examples:
```python ```python
# Use a larger cache # Use a larger cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Use a shared-memory based IPC cache # Use a shared-memory based IPC cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", tensor_parallel_size=2,
tensor_parallel_size=2, mm_processor_cache_type="shm",
mm_processor_cache_type="shm", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Disable the cache # Disable the cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0)
mm_processor_cache_gb=0,
)
``` ```
### Cache Placement ### Cache Placement

View File

@ -35,7 +35,6 @@ th {
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` | | Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| RandomForReranking | ✅ | ✅ | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
@ -879,51 +878,6 @@ vllm bench serve \
</details> </details>
#### Reranker Benchmark
Benchmark the performance of rerank requests in vLLM.
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Unlike generative models, which use the Completions API or Chat Completions API,
reranker benchmarks require setting `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
For reranking, the only supported dataset is `--dataset-name random-rerank`.
Start the server:
```bash
vllm serve BAAI/bge-reranker-v2-m3
```
Run the benchmark:
```bash
vllm bench serve \
--model BAAI/bge-reranker-v2-m3 \
--backend vllm-rerank \
--endpoint /v1/rerank \
--dataset-name random-rerank \
--tokenizer BAAI/bge-reranker-v2-m3 \
--random-input-len 512 \
--num-prompts 10 \
--random-batch-size 5
```
For reranker models, this will create `num_prompts / random_batch_size` requests with
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
In the example above, this results in 2 rerank requests with 5 "documents" each where
each document has close to 512 tokens.
Please note that `/v1/rerank` is also supported by embedding models, so if you're running
with an embedding model, also set `--no_reranker`. In that case the query is treated as an
individual prompt by the server, so `random_batch_size - 1` documents are sent to account
for the extra prompt that carries the query, and the token accounting used to report
throughput numbers is adjusted accordingly.
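For reference, each request this benchmark issues is an ordinary Reranker API call. A hand-written sketch of a single request is shown below; the request field names and the default `localhost:8000` address are assumptions for illustration, not taken from this page:

```bash
# Sketch of one rerank request: a single query scored against a batch of documents.
# Field names and port are assumptions; adjust to your deployment.
curl -X POST http://localhost:8000/v1/rerank \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "BAAI/bge-reranker-v2-m3",
        "query": "What is the capital of France?",
        "documents": [
          "Paris is the capital of France.",
          "Berlin is the capital of Germany."
        ]
      }'
```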
</details>
[](){ #performance-benchmarks } [](){ #performance-benchmarks }
## Performance Benchmarks ## Performance Benchmarks

View File

@ -73,8 +73,8 @@ def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor: ) -> torch.Tensor:
... ...
``` ```

View File

@ -16,7 +16,7 @@ Further update the model as follows:
... ...
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"): if modality.startswith("image"):
return "<image>" return "<image>"
@ -45,14 +45,14 @@ Further update the model as follows:
... ...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(
self, self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt: For example, if the model supports any number of images but only one video per prompt:
```python ```python
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
@ -421,10 +421,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
return ImageSize( return ImageSize(width=image_processor.size["width"],
width=image_processor.size["width"], height=image_processor.size["height"])
height=image_processor.size["height"],
)
``` ```
Fuyu does not expect image placeholders in the inputs to HF processor, so Fuyu does not expect image placeholders in the inputs to HF processor, so
@ -454,12 +452,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
return { return {
"image": "image":
self._get_dummy_images( self._get_dummy_images(width=target_width,
width=target_width, height=target_height,
height=target_height, num_images=num_images,
num_images=num_images, overrides=image_overrides)
overrides=image_overrides,
)
} }
``` ```
@ -748,7 +744,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -784,7 +781,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -812,11 +810,9 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor( + @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ YourMultiModalProcessor, + info=YourProcessingInfo,
+ info=YourProcessingInfo, + dummy_inputs=YourDummyInputsBuilder)
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal): class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
``` ```

View File

@ -42,7 +42,7 @@ def register():
ModelRegistry.register_model( ModelRegistry.register_model(
"YourModelForCausalLM", "YourModelForCausalLM",
"your_code:YourModelForCausalLM", "your_code:YourModelForCausalLM"
) )
``` ```

View File

@ -15,9 +15,8 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper). - Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper).
??? code "supported_languages and supports_transcription_only" ??? code "supported_languages and supports_transcription_only"
```python ```python
from typing import ClassVar, Mapping, Literal from typing import ClassVar, Mapping, Optional, Literal
import numpy as np import numpy as np
import torch import torch
from torch import nn from torch import nn
@ -44,7 +43,6 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model: This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()" ??? code "get_speech_to_text_config()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -73,7 +71,6 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -84,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
language: str | None, language: Optional[str],
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
to_language: str | None, to_language: Optional[str],
) -> PromptType: ) -> PromptType:
# Example with a free-form instruction prompt # Example with a free-form instruction prompt
task_word = "Transcribe" if task_type == "transcribe" else "Translate" task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@ -110,7 +107,6 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -121,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
audio: np.ndarray, audio: np.ndarray,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
language: str | None, language: Optional[str],
task_type: Literal["transcribe", "translate"], task_type: Literal["transcribe", "translate"],
request_prompt: str, request_prompt: str,
to_language: str | None, to_language: Optional[str],
) -> PromptType: ) -> PromptType:
if language is None: if language is None:
raise ValueError("Language must be specified") raise ValueError("Language must be specified")
@ -152,16 +148,12 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper): If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()" ??? code "validate_language()"
```python ```python
@classmethod @classmethod
def validate_language(cls, language: str | None) -> str | None: def validate_language(cls, language: Optional[str]) -> Optional[str]:
if language is None: if language is None:
logger.warning( logger.warning(
"Defaulting to language='en'. If you wish to transcribe " "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en" language = "en"
return super().validate_language(language) return super().validate_language(language)
``` ```
@ -173,7 +165,6 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics: Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()" ??? code "get_num_audio_tokens()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -184,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
audio_duration_s: float, audio_duration_s: float,
stt_config: SpeechToTextConfig, stt_config: SpeechToTextConfig,
model_config: ModelConfig, model_config: ModelConfig,
) -> int | None: ) -> Optional[int]:
# Return None if unknown; otherwise return an estimate. # Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example return int(audio_duration_s * stt_config.sample_rate // 320) # example
``` ```
@ -200,7 +191,6 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic: Relevant server logic:
??? code "_preprocess_speech_to_text()" ??? code "_preprocess_speech_to_text()"
```python ```python
# vllm/entrypoints/openai/speech_to_text.py # vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...): async def _preprocess_speech_to_text(...):

View File

@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
??? console "Command" ??? console "Command"
```bash ```python
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \ -H 'Authorization: <JWT TOKEN>' \
@ -81,7 +81,7 @@ You should get a response like:
??? console "Response" ??? console "Response"
```json ```python
{ {
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": { "result": {

View File

@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
client = OpenAI( client = OpenAI(
base_url="https://gateway.<gateway domain>", base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>", api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
) )
completion = client.chat.completions.create( completion = client.chat.completions.create(
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user", "role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.", "content": "Compose a poem that explains the concept of recursion in programming.",
} }
], ]
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)

View File

@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1", model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs={"max_tokens": 512}, generation_kwargs = {"max_tokens": 512}
) )
response = generator.run( response = generator.run(

View File

@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os import os
client = OpenAI( client = OpenAI(
base_url=DEPLOYMENT_URL, base_url = DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model="HuggingFaceTB/SmolLM3-3B", model = "HuggingFaceTB/SmolLM3-3B",
messages=[ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "Give me a brief explanation of gravity in simple terms.", "text": "Give me a brief explanation of gravity in simple terms."
} }
], ]
} }
], ],
stream=True, stream = True
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end="") print(message.choices[0].delta.content, end = "")
``` ```
!!! note !!! note
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os import os
client = OpenAI( client = OpenAI(
base_url=DEPLOYMENT_URL, base_url = DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model="ibm-granite/granite-docling-258M", model = "ibm-granite/granite-docling-258M",
messages=[ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png", "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
}, }
}, },
{ {
"type": "text", "type": "text",
"text": "Convert this page to docling.", "text": "Convert this page to docling."
}, }
] ]
} }
], ],
stream=True, stream = True
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end="") print(message.choices[0].delta.content, end = "")
``` ```
!!! note !!! note

View File

@ -36,16 +36,15 @@ pip install vllm litellm
```python ```python
import litellm import litellm
messages = [{"content": "Hello, how are you?", "role": "user"}] messages = [{ "content": "Hello, how are you?","role": "user"}]
# hosted_vllm is prefix key word and necessary # hosted_vllm is prefix key word and necessary
response = litellm.completion( response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages, messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2, temperature=0.2,
max_tokens=80, max_tokens=80)
)
print(response) print(response)
``` ```

View File

@ -40,7 +40,7 @@ pip install -U vllm \
1. Run the script 1. Run the script
```bash ```python
python retrieval_augmented_generation_with_langchain.py python retrieval_augmented_generation_with_langchain.py
``` ```
@ -78,6 +78,6 @@ pip install vllm \
1. Run the script: 1. Run the script:
```bash ```python
python retrieval_augmented_generation_with_llamaindex.py python retrieval_augmented_generation_with_llamaindex.py
``` ```

View File

@ -106,11 +106,9 @@ The dispatch code looks like:
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
# execution # execution
with set_forward_context( with set_forward_context(...,
..., cudagraph_runtime_mode=runtime_mode,
cudagraph_runtime_mode=runtime_mode, batch_descriptor=batch_descriptor):
batch_descriptor=batch_descriptor,
):
output = self.model(...) output = self.model(...)
``` ```
@ -167,7 +165,7 @@ class AttentionCGSupport(enum.Enum):
"""NO CUDA Graphs support""" """NO CUDA Graphs support"""
``` ```
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
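As a rough illustration of this fallback, the sketch below mimics the downgrade policy in plain Python. It is hand-written for this explanation: the capability ordering and any names other than `UNIFORM_BATCH`/`NEVER` are assumptions, and the authoritative logic lives in `initialize_cudagraph_capture`.

```python
# Hand-written sketch of the mode-downgrade policy described above; not the real
# implementation. Capability names other than UNIFORM_BATCH/NEVER are assumed.
def resolve_cudagraph_mode(requested: str, backend_capabilities: list[str]) -> str:
    order = ["NEVER", "UNIFORM_BATCH", "ALWAYS"]  # assumed weakest-to-strongest
    weakest = min(backend_capabilities, key=order.index)
    if requested == "FULL" and weakest == "UNIFORM_BATCH":
        return "FULL_AND_PIECEWISE"
    if requested == "FULL" and weakest == "NEVER":
        return "PIECEWISE"
    return requested


# A hybrid model whose weakest backend only supports uniform batches:
print(resolve_cudagraph_mode("FULL", ["ALWAYS", "UNIFORM_BATCH"]))  # FULL_AND_PIECEWISE
```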
The following table lists backends that support full CUDA Graphs at the time of writing. The following table lists backends that support full CUDA Graphs at the time of writing.
@ -202,12 +200,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
import vllm import vllm
from vllm.config import CUDAGraphMode from vllm.config import CUDAGraphMode
compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM( model = vllm.LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
dtype="auto", dtype='auto',
compilation_config=compilation_config, compilation_config = compilation_config,
) )
sampling_params = vllm.SamplingParams( sampling_params = vllm.SamplingParams(
temperature=0, # greedy decoding temperature=0, # greedy decoding
max_tokens=1024, max_tokens=1024,

View File

@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch * `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch * `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled. Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency` EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo`
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES` Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`
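For a prefill-heavy workload, the same flags apply with the high-throughput backend; a sketch (the threshold value below is an arbitrary example, not a recommendation):

```bash
# Sketch: two DP ranks with expert parallelism and DBO tuned for prefill-heavy traffic.
vllm serve deepseek-ai/DeepSeek-V2-Lite \
  --trust-remote-code \
  --data-parallel-size 2 \
  --enable-expert-parallel \
  --enable-dbo \
  --dbo-prefill-token-threshold 512 \
  --all2all-backend deepep_high_throughput
```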

View File

@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>): IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
```python ```python
IOProcessorInput = TypeVar("IOProcessorInput") IOProcessorInput = TypeVar('IOProcessorInput')
IOProcessorOutput = TypeVar("IOProcessorOutput") IOProcessorOutput = TypeVar('IOProcessorOutput')
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@ -21,32 +21,30 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
def pre_process( def pre_process(
self, self,
prompt: IOProcessorInput, prompt: IOProcessorInput,
request_id: str | None = None, request_id: Optional[str] = None,
**kwargs, **kwargs,
) -> PromptType | Sequence[PromptType]: ) -> Union[PromptType, Sequence[PromptType]]:
raise NotImplementedError raise NotImplementedError
async def pre_process_async( async def pre_process_async(
self, self,
prompt: IOProcessorInput, prompt: IOProcessorInput,
request_id: str | None = None, request_id: Optional[str] = None,
**kwargs, **kwargs,
) -> PromptType | Sequence[PromptType]: ) -> Union[PromptType, Sequence[PromptType]]:
return self.pre_process(prompt, request_id, **kwargs) return self.pre_process(prompt, request_id, **kwargs)
@abstractmethod @abstractmethod
def post_process( def post_process(self,
self, model_output: Sequence[PoolingRequestOutput],
model_output: Sequence[PoolingRequestOutput], request_id: Optional[str] = None,
request_id: str | None = None, **kwargs) -> IOProcessorOutput:
**kwargs,
) -> IOProcessorOutput:
raise NotImplementedError raise NotImplementedError
async def post_process_async( async def post_process_async(
self, self,
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
request_id: str | None = None, request_id: Optional[str] = None,
**kwargs, **kwargs,
) -> IOProcessorOutput: ) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output] collected_output = [item async for i, item in model_output]
@ -58,8 +56,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@abstractmethod @abstractmethod
def output_to_response( def output_to_response(
self, plugin_output: IOProcessorOutput self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
) -> IOProcessorResponse:
raise NotImplementedError raise NotImplementedError
``` ```
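To make the interface concrete, here is a hand-written sketch of a minimal plugin. Only the three interface methods come from the code above; the import paths, the `PoolingRequestOutput` attribute layout, and the request/response shapes are assumptions for illustration.

```python
# Hand-written sketch of a minimal IO Processor plugin; not a real vLLM plugin.
from collections.abc import Sequence

from vllm.outputs import PoolingRequestOutput  # import path assumed
from vllm.plugins.io_processors.interface import IOProcessor  # path from the doc above


class PromptEchoProcessor(IOProcessor[dict, list[list[float]]]):
    """Accepts {"texts": [...]} and returns the raw pooled vectors."""

    def pre_process(self, prompt: dict, request_id: str | None = None, **kwargs):
        # Fan the plugin-specific input out into one engine prompt per text.
        return [{"prompt": text} for text in prompt["texts"]]

    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: str | None = None,
        **kwargs,
    ) -> list[list[float]]:
        # Collect one pooled vector per prompt (attribute layout assumed).
        return [out.outputs.data.tolist() for out in model_output]

    def output_to_response(self, plugin_output: list[list[float]]):
        # Wrapping into an IOProcessorResponse is elided in this sketch.
        raise NotImplementedError
```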

View File

@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from typing import TYPE_CHECKING

import torch
@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
    @abstractmethod
    def update_state(
        self,
        batch_update: "BatchUpdate" | None,
    ) -> None:
        """Called when there are new output tokens, prior
        to each forward pass.
@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
    * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
    * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
    * Use the `BatchUpdate` members to update logits processor internal state
    * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.

View File

@ -478,17 +478,15 @@ us with:
```python
if seq_group.is_finished():
    if (
        seq_group.metrics.first_scheduled_time is not None
        and seq_group.metrics.first_token_time is not None
    ):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time
        )
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(seq_group.metrics.time_in_queue)
```

This seems duplicative, and one of them should be removed. The latter

View File

@ -112,8 +112,8 @@ class KVCacheBlock:
    ref_cnt: int

    # The pointers to form a doubly linked list for the free queue.
    prev_free_block: "KVCacheBlock | None" = None
    next_free_block: "KVCacheBlock | None" = None
```

There are two design points to highlight:

View File

@ -93,6 +93,7 @@ The contrived example below implements a custom logits processor which consumes
??? code "Example custom logits processor definition"

    ``` python
    import torch
    from vllm.config import VllmConfig
    from vllm.sampling_params import SamplingParams
@ -111,7 +112,7 @@ The contrived example below implements a custom logits processor which consumes
"""Never impacts greedy sampling""" """Never impacts greedy sampling"""
return False return False
def update_state(self, batch_update: BatchUpdate | None): def update_state(self, batch_update: Optional[BatchUpdate]):
if not batch_update: if not batch_update:
return return

View File

@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    stop=["[/assistant]"],
)

prompts = [
@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
)
```
@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_path=local_path,
            lora_int_id=abs(hash(lora_name)),
        )
        return lora_request
    ```
@ -296,7 +296,10 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
    if has_audio:
        question = f"<|audio|>{question}"
    chat = [
        {"role": "user", "content": question},
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False)

View File

@ -154,7 +154,9 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": [image1, image2]},
})

for o in outputs:
@ -181,24 +183,21 @@ conversation = [
{"role": "assistant", "content": "Hello! How can I assist you today?"}, {"role": "assistant", "content": "Hello! How can I assist you today?"},
{ {
"role": "user", "role": "user",
"content": [ "content": [{
{ "type": "image_url",
"type": "image_url", "image_url": {
"image_url": {"url": image_url}, "url": image_url
}, }
{ },{
"type": "image_pil", "type": "image_pil",
"image_pil": image_pil, "image_pil": image_pil
}, }, {
{ "type": "image_embeds",
"type": "image_embeds", "image_embeds": image_embeds
"image_embeds": image_embeds, }, {
}, "type": "text",
{ "text": "What's in these images?"
"type": "text", }],
"text": "What's in these images?",
},
],
}, },
] ]
@ -225,10 +224,7 @@ Multi-image input can be extended to perform video captioning. We show this with
message = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Describe this set of frames. Consider the frames to be a part of the same video.",
        },
    ],
}

for i in range(len(video_frames)):
@ -259,13 +255,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
# Custom black background for dark theme
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
)

# Custom brand color background (e.g., blue)
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
)
```
@ -298,23 +294,20 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
limit_mm_per_prompt={"video": 1}, limit_mm_per_prompt={"video": 1},
) )
sampling_params = SamplingParams(max_tokens=1024) sampling_params = SamplingParams(
max_tokens=1024,
)
video_messages = [ video_messages = [
{ {"role": "system", "content": "You are a helpful assistant."},
"role": "system", {"role": "user", "content": [
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{"type": "text", "text": "describe this video."}, {"type": "text", "text": "describe this video."},
{ {
"type": "video", "type": "video",
"video": video_path, "video": video_path,
"total_pixels": 20480 * 28 * 28, "total_pixels": 20480 * 28 * 28,
"min_pixels": 16 * 28 * 28, "min_pixels": 16 * 28 * 28
}, }
] ]
}, },
] ]
@ -472,24 +465,21 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                    "uuid": image_url,  # Optional
                },
            ],
        }
    ],
)

print("Chat completion output:", chat_response.choices[0].message.content)
@ -499,27 +489,26 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_duck},
                    "uuid": image_url_duck,  # Optional
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_lion},
                    "uuid": image_url_lion,  # Optional
                },
            ],
        }
    ],
)

print("Chat completion output:", chat_response.choices[0].message.content)
```
@ -571,22 +560,23 @@ Then, you can use the OpenAI client as follows:
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?",
                },
                {
                    "type": "video_url",
                    "video_url": {"url": video_url},
                    "uuid": video_url,  # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@ -662,25 +652,23 @@ Then, you can use the OpenAI client as follows:
audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav",
                    },
                    "uuid": audio_url,  # Optional
                },
            ],
        },
    ],
    model=model,
    max_completion_tokens=64,
)
@ -695,22 +683,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
```python
chat_completion_from_url = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_url},
                    "uuid": audio_url,  # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@ -759,48 +747,43 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
    "type": "image_embeds",
    "image_embeds": f"{base64_image_embedding}",
    "uuid": image_url,  # Optional
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_grid_thw": f"{base64_image_grid_thw}",  # Required by Qwen/Qwen2-VL-2B-Instruct
    },
    "uuid": image_url,  # Optional
}

model = "openbmb/MiniCPM-V-2_6"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_sizes": f"{base64_image_sizes}",  # Required by openbmb/MiniCPM-V-2_6
    },
    "uuid": image_url,  # Optional
}

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ],
        },
    ],
    model=model,
)
```
@ -819,22 +802,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
    {
        "type": "image_embeds",
        "image_embeds": None,
        "uuid": image_uuid,
    },
    # input_audio:
    {
        "type": "input_audio",
        "input_audio": None,
        "uuid": audio_uuid,
    },
    # PIL Image:
    {
        "type": "image_pil",
        "image_pil": None,
        "uuid": image_uuid,
    },
```

View File

@ -156,16 +156,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
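
As a concrete illustration, the JSON payload passed to `--kv-transfer-config` for NixlConnector might look like the sketch below; the `kv_role` value shown is an assumption and, per the note above, is effectively ignored by this connector.

```python
# Illustrative payload for --kv-transfer-config (values are assumptions).
kv_transfer_config = {
    "kv_connector": "NixlConnector",
    "kv_role": "kv_both",  # placeholder; NixlConnector does not act on it
}
```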
## Experimental Feature
### Heterogeneous KV Layout support
Supported use case: prefill with the 'HND' KV layout and decode with the 'NHD' layout, enabled with the experimental configuration:
```bash
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
```
## Example Scripts/Code

Refer to these example scripts in the vLLM repository:

View File

@ -1,9 +1,5 @@
# AutoAWQ

> ⚠️ **Warning:**
> The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq).
> For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ).

To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
The main benefits are lower latency and memory usage.
@ -22,15 +18,13 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "mistral-instruct-v0.2-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

View File

@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

bits, group_size, sym = 4, 128, True

View File

@ -34,7 +34,7 @@ llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitblas",
)
```
@ -53,6 +53,6 @@ llm = LLM(
    dtype=torch.float16,
    trust_remote_code=True,
    quantization="bitblas",
    max_model_len=1024,
)
```

View File

@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)
```
@ -43,7 +43,7 @@ llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitsandbytes",
)
```

View File

@ -41,9 +41,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, MODEL_ID, device_map="auto", torch_dtype="auto",
device_map="auto",
dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
@ -65,10 +63,7 @@ Since simple RTN does not require data for weight quantization and the activatio
# Configure the simple PTQ quantization
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)

View File

@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant",
    },
    {
        "role": "user",
        "content": "Hello",
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?",
    },
    {
        "role": "user",
@ -67,10 +67,8 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)

View File

@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset(
    "allenai/c4",
    data_files="en/c4-train.00001-of-01024.json.gz",
    split="train",
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)

View File

@ -39,9 +39,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@ -168,7 +166,7 @@ The following is an example of an expanded quantization recipe you can tune to y
    },
    ignore=["lm_head"],
    update_size=NUM_CALIBRATION_SAMPLES,
    dampening_frac=0.01,
)
```

View File

@ -44,9 +44,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```

View File

@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams

def main():
    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

View File

@ -41,11 +41,9 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    kv_cache_dtype="fp8",
    calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
@ -82,7 +80,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset

View File

@ -48,9 +48,7 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
model.eval()
@ -77,18 +75,10 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA] text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer( tokenized_outputs = tokenizer(text_data, return_tensors="pt",
text_data, padding=True, truncation=True, max_length=MAX_SEQ_LEN)
return_tensors="pt", calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
padding=True, batch_size=BATCH_SIZE, drop_last=True)
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
``` ```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
@ -113,32 +103,26 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    load_quant_algo_config_from_file)

# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
    observer_method="min_max",
    is_dynamic=False,
).to_quantization_spec()

# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(
    input_tensors=FP8_PER_TENSOR_SPEC,
    weight=FP8_PER_TENSOR_SPEC,
)

# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {
    name: QuantizationConfig(
        input_tensors=global_quant_config.input_tensors,
        weight=global_quant_config.weight,
        output_tensors=KV_CACHE_SPEC,
    )
    for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy()

# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

EXCLUDE_LAYERS = ["lm_head"]
@ -147,8 +131,7 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    layer_quant_config=layer_quant_config,
    kv_cache_quant_config=kv_cache_quant_config,
    exclude=EXCLUDE_LAYERS,
    algo_config=algo_config,
)
```

### 4. Quantize the Model and Export
@ -182,11 +165,8 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
    exporter.export_safetensors_model(
        freezed_model,
        quant_config=quant_config,
        tokenizer=tokenizer,
    )
```

### 5. Evaluation in vLLM
@ -209,11 +189,8 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
    kv_cache_dtype="fp8",
    quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

View File

@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)

View File

@ -11,9 +11,6 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
@ -21,9 +18,8 @@ vLLM currently supports the following reasoning models:
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |

!!! note
    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
    The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
    DeepSeek-V3.1 tool calling is supported in non-thinking mode.

## Quickstart
@ -119,11 +115,9 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(
    model=model,
    messages=messages,
    stream=True,
)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
@ -163,29 +157,27 @@ The reasoning content is also available when both tool calling and the reasoning
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            },
        },
    }
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
)

print(response)
@ -231,7 +223,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and
@ -241,10 +233,8 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
""" """
def extract_reasoning_content( def extract_reasoning_content(
self, self, model_output: str, request: ChatCompletionRequest
model_output: str, ) -> tuple[Optional[str], Optional[str]]:
request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
""" """
Extract reasoning content from a complete model-generated string. Extract reasoning content from a complete model-generated string.
@ -282,10 +272,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
    @classmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
        return cls(
            start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
            end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.end_token_id in input_ids

View File

@ -27,29 +27,27 @@ Next, make a request that triggers the model to use the available tools:
return f"Getting the weather for {location} in {unit}..." return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather} tool_functions = {"get_weather": get_weather}
tools = [ tools = [{
{ "type": "function",
"type": "function", "function": {
"function": { "name": "get_weather",
"name": "get_weather", "description": "Get the current weather in a given location",
"description": "Get the current weather in a given location", "parameters": {
"parameters": { "type": "object",
"type": "object", "properties": {
"properties": { "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"],
}, },
}, "required": ["location", "unit"]
}, }
] }
}]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto", tool_choice="auto"
) )
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
@ -352,16 +350,6 @@ Supported models:
Flags: `--tool-call-parser qwen3_xml`
### Olmo 3 Models (`olmo3`)
Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
Supported models:
* TODO (will be updated after Olmo 3 release)
Flags: `--tool-call-parser olmo3`
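
For illustration only (Olmo 3 details are still pending the model release), raw output in this format might look like the hedged sketch below; the function name and arguments are hypothetical.

```python
# Hypothetical raw model output in the Olmo 3 tool-call format:
# newline-delimited pythonic calls wrapped in <function_calls> tags,
# with JSON literals (true/false/null) also accepted by the parser.
raw_olmo3_output = (
    "<function_calls>\n"
    'get_weather(city="San Francisco", unit=null)\n'
    'get_weather(city="New York", unit="fahrenheit")\n'
    "</function_calls>"
)
print(raw_olmo3_output)
```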
### Models with Pythonic Tool Calls (`pythonic`)

A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
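
As a rough sketch (the function names and arguments here are hypothetical, not tied to any particular model), raw output in the pythonic style is simply a Python-style list of calls:

```python
# Hypothetical raw model output for the `pythonic` parser: a single list
# literal containing one entry per (possibly parallel) tool call.
raw_pythonic_output = (
    '[get_weather(city="San Francisco", unit="celsius"), '
    'get_weather(city="New York", unit="fahrenheit")]'
)
print(raw_pythonic_output)
```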
@ -414,7 +402,8 @@ Here is a summary of a plugin file:
    # adjust request. e.g.: set skip special tokens
    # to False for tool call output.
    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        return request

    # implement the tool call parse for stream call
@ -427,7 +416,7 @@ Here is a summary of a plugin file:
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        return delta

    # implement the tool parse for non-stream call

View File

@ -23,46 +23,7 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```
Testing has been conducted on AWS Graviton3 instances for compatibility.

View File

@ -0,0 +1,45 @@
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu python setup.py install
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu python setup.py develop
```
!!! note
If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
# --8<-- [end:extra-information]

View File

@ -194,10 +194,8 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
    api_key=openai_api_key,
    base_url=openai_api_base,
)
completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
)

print("Completion result:", completion)
```
@ -241,7 +239,7 @@ Alternatively, you can use the `openai` Python package:
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a joke."},
    ],
)
print("Chat response:", chat_response)
```

View File

@ -22,11 +22,6 @@ sys.modules["vllm._C"] = MagicMock()
class PydanticMagicMock(MagicMock):
    """`MagicMock` that's able to generate pydantic-core schemas."""

    def __init__(self, *args, **kwargs):
        name = kwargs.pop("name", None)
        super().__init__(*args, **kwargs)
        self.__spec__ = importlib.machinery.ModuleSpec(name, None)

    def __get_pydantic_core_schema__(self, source_type, handler):
        return core_schema.any_schema()
@ -47,9 +42,7 @@ def auto_mock(module, attr, max_mocks=50):
            raise e
        except ModuleNotFoundError as e:
            logger.info("Mocking %s for argparse doc generation", e.name)
            sys.modules[e.name] = PydanticMagicMock(name=e.name)
        except Exception as e:
            logger.warning("Failed to import %s.%s: %s", module, attr, e)

    raise ImportError(
        f"Failed to import {module}.{attr} after mocking {max_mocks} imports"

View File

@ -60,7 +60,7 @@ from vllm import LLM
llm = LLM(
    "s3://my-bucket/vllm/facebook/opt-125m/v1",
    load_format="tensorizer",
    enable_lora=True,
)
```
@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer", load_format="tensorizer",
enable_lora=True, enable_lora=True,
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}, model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
) )
``` ```

Some files were not shown because too many files have changed in this diff.