Compare commits


184 Commits

Author SHA1 Message Date
a2599dca0f fix missing removal
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-17 11:35:42 -07:00
3fd66b1e73 [Misc] Remove unused virtual engine flag
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-16 23:04:05 -07:00
fec2b341ad [Kernel] Lazy import FlashInfer (#26977) 2025-10-17 04:48:18 +00:00
87bc0c492f [Bugfix] Fix ReplicatedLinearWithLoRA (#27065)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:43:16 +00:00
fe3b9372ad [Core] Change execute_model_with_error_logging() to be a ctx manager (#27060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-17 11:45:32 +08:00
bde9e2272a [Bugfix][Qwen] fixes the weights dtype in qwen3_next: it is actually a bfloat16 (#27030)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-10-17 03:37:52 +00:00
08405609cc disable graph partition in custom op (#26952)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 11:08:47 +08:00
ab81379ea6 [Perf] Exploit out-of-band buffers in shm_broadcast (#26961)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-16 20:08:03 -07:00
4ffd6e8942 [Docs] Reduce custom syntax used in docs (#27009)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 20:05:34 -07:00
965c5f4914 vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-10-16 19:55:09 -07:00
4d055ef465 Remove unused imports (#26972)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 19:51:17 -07:00
17c540a993 [torch.compile] fix simple inductor graph partition test (#27050)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-16 21:09:36 -04:00
4d4d6bad19 [Chore] Separate out vllm.utils.importlib (#27022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 00:48:59 +00:00
11ae016bd7 [torch.compile] Passing only necessary compilation config to inductor pass config (#27041)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-17 00:01:52 +00:00
41d3071918 [NVIDIA] [Perf] Update to leverage flashinfer trtllm FP4 MOE throughput kernel (#26714)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 16:20:25 -07:00
fb5e10d3fb Refactor Transformers backend to use mixins (#26906)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 21:50:39 +00:00
b2f78cbad4 [small][batch invariance] Rename the env and internal flags to simplify usage (#26855)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-16 21:40:25 +00:00
23583ee28c [Bug] Add Assertion for random-input-len / random-output-len (#26834)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 21:36:39 +00:00
01c977e96d [CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
b3dda72c23 [Feature] Migrate DeepGEMM API from get_m_alignment_for_contiguous_layout to get_mk_alignment_for_contiguous_layout (#26935)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 16:46:48 -04:00
fb0571b077 [GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-16 12:53:11 -07:00
2ed8b6b3d0 [Bug] Fix batch invariant test has to is (#27032)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 19:45:14 +00:00
013abde6ef Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
2025-10-16 12:44:32 -07:00
a5464dcf92 [Compressed Tensors] Always clone output for compile robustness (#26849)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 19:29:59 +00:00
ac3ed5a815 Support block size of 256 used by Intel HPU (#26883)
Signed-off-by: mandy-li <mandy.j.li@intel.com>
2025-10-16 15:10:57 -04:00
e6ba2000ae [gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-10-16 18:44:06 +00:00
aa255ff55a Support set in the CLI generation (#27031)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 18:07:18 +00:00
7bb736d00e Fix Qwen2.5 VL image grid docstring (#27033)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-10-16 09:57:36 -07:00
9f4e30904b [Model] Fix Qwen3VL mm mapping (#27027)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-16 09:45:59 -07:00
5afd3276df [Feature] Add process_weights_after_loading to AttentionImpl (#26870)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-16 08:02:30 -07:00
43721bc67f [CI] Replace large models with tiny alternatives in tests (#24057)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 15:51:27 +01:00
02d709a6f1 [docs] standardize Hugging Face env var to HF_TOKEN (deprecates HUGGING_FACE_HUB_TOKEN) (#27020)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-10-16 15:31:02 +01:00
4a510ab487 [NIXL] Improve request_finished() debug logs (#25665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-16 15:55:17 +02:00
314fa8abbf [Attention] Tune CUTLASS MLA num_splits (#26846)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-16 06:36:09 -07:00
334535b6fb [Benchmark] Show E2EL by default for pooling models (#27014)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 12:47:09 +00:00
dcbb3f1871 [Bugfix] Correct LayerNorm epsilon parameter in modernbert.py (#27008)
Signed-off-by: bogdanm <152898065+bogdan01m@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 12:27:44 +00:00
00417f4e44 [MISC] fix import violations for re and triton modules (#26654)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-10-16 03:38:27 -07:00
ed344f4116 Cleanup code after Python 3.10 upgrade (#26520)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 03:38:23 -07:00
e51928793e [Model][Bugfix] fix ernie45 vl run failed from shared experts optimization (#26885)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-16 03:37:35 -07:00
d2740fafbf [Chore] Separate out vllm.utils.collections (#26990)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 08:35:35 +00:00
17838e50ef [Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 16:02:39 +08:00
44c8555621 [CI/Build] Fix AMD import failures in CI (#26841)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-16 07:28:20 +00:00
f7d318de2b [Hardware][CPU][PowerPC]Disable torch.compile() in toptopk sampling (#26987)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-15 22:36:59 -07:00
76f0d05bc6 [CI/Build] Update expected beam search output for Phi3V (#26978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 05:12:44 +00:00
7d8975de84 Deepseek-v3 Batch Invariant on 8xH100 (#26609)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 22:06:02 -07:00
785d8b6410 [PERF] Qwen3-next MTP speedup (change bool mask indexing to index_select / index_copy to reduce d2h) (#26437)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-16 12:18:31 +08:00
f6cdc9a02f [Chore] Rename utils submodules (#26920)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 03:58:13 +00:00
509cdc0370 [DOC][XPU]update feature parity with Intel GPU (#26954)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 20:07:10 -07:00
9b6504c307 [BugFix] Work around graph partition x torch.compile cache issue (#26956)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-15 20:06:11 -07:00
e19b16dde6 [bugfix] Fix SP + PP without specifying compile size (#26955)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 20:05:33 -07:00
582f2c6be7 [BUG] Allow runai_streamer_sharded in config check (#26958)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-15 20:05:14 -07:00
f8a0acbdbe [CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 21:02:57 -06:00
1317034379 [ROCm][FEAT] Fuse DeepSeek shared experts into AITER fused_moe ops (#24097)
Signed-off-by: chenjun <junchen2@amd.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: valarLip <103567126+valarLip@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2025-10-16 10:41:34 +08:00
0ecc553ee6 [Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)
Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
2025-10-16 10:24:05 +08:00
f96bc3649c [Qwen3-Next] Add tuned MoE config for Qwen3-Next FP8 on H100 tp2 (#26887)
Signed-off-by: Felix Zhu <felixzhu555@gmail.com>
2025-10-15 18:55:05 -07:00
938c43ea7f [ci] Adjusting AMD test composition 2025-10-14 (#26852)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-15 23:52:13 +00:00
0a9ef0cfce Move query quantization to attention layer for Flashinfer & Triton. (#26534)
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 19:01:38 -04:00
e5b438a247 [Bug] Temporally Disable VLLM_ALLREDUCE_USE_SYMM_MEM by Default (#26925)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 16:18:50 -04:00
0b99f5d302 support flashinfer_fp4 moe for 5090 gpu (#26669)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 15:06:47 -04:00
1f491aa0c8 Vectorize RMS norm variance using vectorize_read_with_alignment (#26234)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 11:54:41 -07:00
de92d916fe [NVIDIA] Add support for cudnn fp4 gemm via flashinfer (#26107)
Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-15 13:53:00 -04:00
a1063628a4 [Chore] Clean up CODEOWNERS (#26923)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-15 10:52:54 -07:00
d796375258 [ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
2025-10-15 13:06:17 -04:00
14f8456344 [Feature]: Use pydantic validation in observability.py config (#26637)
Signed-off-by: Samuel Wu <cernunnos1710@gmail.com>
Signed-off-by: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 16:44:03 +00:00
4794c2bd92 Olmo 3 tool parser and tests (#26143)
Signed-off-by: Pradeep Dasigi <pradeepd@allenai.org>
2025-10-15 16:36:12 +00:00
d3cbaa08dc Lower sevarity of log when model info cache misses due to exception (#26917)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 09:01:09 -07:00
828523ad8e [Chore] Separate out vllm.utils.async_utils (#26913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 15:33:00 +00:00
136a17fe6e [Chore] Separate out vllm.utils.func (#26904)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 13:03:58 +00:00
f57438338d [BugFix] Patch inductor memory plan logic (#26878)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 12:51:45 +00:00
5d598680e3 chore: remove unused marker (#26890)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
2025-10-15 05:40:33 -07:00
8f4b313c37 [Misc] rename torch_dtype to dtype (#26695)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 12:11:48 +00:00
f93e348010 [Misc] Remove isort and yapf ignores (#26888)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 12:09:03 +00:00
f54f85129e [Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-15 11:14:41 +00:00
d4d1a6024f [Lora]Load tuned multi-lora kernel configs from json files (#26319)
Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Signed-off-by: Haipeng Li <li2haipeng@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-15 09:45:14 +00:00
db1764e4e0 [Platform] allow platform to init dp group (#22243)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 02:32:17 -07:00
7f83b4ee8e [Easy] Get rid of unnecessary paraenthesis in kv_cache_manager (#26842)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 09:17:43 +00:00
5c3bae1a6a [Fix] Remove divisibility requirement between num_kv_heads and tp_size in bailing_moe (#26876)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
2025-10-15 16:44:04 +08:00
5210dc3940 [Misc] Update TritonLanguagePlaceholder to have attributes that are used by Flash Linear Attention ops. (#26853)
Co-authored-by: Xudong Ma <mxd@meta.com>
2025-10-15 08:37:49 +00:00
650b51f9f9 [doc] add Context Parallel Deployment doc (#26877)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-15 16:33:52 +08:00
6256697997 [Doc] ruff format remaining Python examples (#26795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 01:25:49 -07:00
71557a5f7c [CI] Fix mypy for vllm/executor (#26845)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 01:23:33 -07:00
f3c378ffa7 [CI/Build] Add Qwen2.5-VL-7B-Instruct ChartQA Accuracy Tests in CI (#21810)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <ye.charlotte.qi@gmail.com>
2025-10-15 08:09:56 +00:00
f5ed68ef63 [Deepseek-V3.2][Kernel] Integrate cuda indexer k cache gather (#26456)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-10-15 16:05:01 +08:00
efdef57b1f [bugfix] Lazy import cv2 (#26869)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 07:47:50 +00:00
b8a4572157 [Misc] Use helper function to generate dummy messages in OpenAI MM tests (#26875)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 07:17:37 +00:00
302ef403a2 [DSA][MLA] Tiny refactor on DeepSeek to make it reusable for different backends (#26656)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 00:16:44 -07:00
8865da157b [Bugfix][Multi Modal] Fix incorrect Molmo token processing (#26873)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-15 07:13:59 +00:00
f0862eae43 [Graph Partition] pass tests for decorator (#26831)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-15 06:39:48 +00:00
8c851f6d04 [Bugfix] Fix qwen3-omni audio truncation issue (#26815)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-15 05:38:36 +00:00
7cfa420f49 [BugFix] Patch inductor partitioning logic (#26735)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 05:04:32 +00:00
a27b288e4a [Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-15 04:23:44 +00:00
e471d7ca7e [CI/Build][Bugfix] fix qutlass cmake error when set QUTLASS_SRC_DIR (#26773)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 04:09:44 +00:00
c43ca8259e [Docs] Move build.inc into arm.inc (#26862)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-14 20:35:08 -07:00
85a65e7f51 [Model] Add DeepSeek-V3.1 reasoning parser (split from PR #24972) (#25589)
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-15 11:09:52 +08:00
a2986b3e33 [Bugfix] Fixes prefix-repetition benchmark script (#26828)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
2025-10-15 02:54:43 +00:00
96b9aa5aa0 [Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 02:51:16 +00:00
e66d787bce Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 02:35:18 +00:00
bfad142e25 [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in get_kv_connector_stats (#26851)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 02:33:25 +00:00
9354660036 [Bugfix]fix Qwen3 xml tool parser (#26345)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-10-15 09:50:30 +08:00
07ca70af8d [Core][Easy] Use envs.__getattr__ for all Unify to environment variable access (#26810)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 01:41:18 +00:00
2dcd12d357 [torch.compile] Fix tests for torch==2.9 inductor partition (#26116)
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-10-14 19:55:02 -04:00
579d2e5458 [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-14 23:51:54 +00:00
0512c04aee [frontend][gptoss] Add per turn stats into Harmony Context (#25061)
Signed-off-by: lacora <hyelacora@gmail.com>
Co-authored-by: Ye Hu <yehu@fb.com>
2025-10-14 16:48:13 -07:00
7e0ef4084a [CI Failure] Fix torchao dep failure for Quantization Test (#26824)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 16:41:43 -07:00
4aed506b65 [Core] Streamline some structured output related code (#26737)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 23:27:44 +00:00
a86b4c58e8 remove attn output view kernel (#26680)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 22:53:10 +00:00
ff4810ba73 [Minor] Group async_scheduling related fields in model runner init (#26736)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 14:46:37 -07:00
9d6964926e fix: response_format for completion (#23212)
Signed-off-by: Nan2018 <qinnanjoshua@gmail.com>
2025-10-14 21:23:22 +00:00
0e65818910 Added MoE configs for llama 4, H200 device with tp=4/8 tuning (#26837)
Signed-off-by: Dhruvil Bhatt <bhattdbh@amazon.com>
2025-10-14 14:21:03 -07:00
380f17527c [Perf] Cache vllm.env.__getattr__ result to avoid recomputation (#26146)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 17:03:21 -04:00
b92ab3deda Notice for deprecation of AutoAWQ (#26820)
Signed-off-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 13:39:59 -07:00
acaa2c0a4a [Core] Reuse empty block lists whenever possible in KVCacheBlocks to mitigate GC costs (#24964)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 12:58:43 -07:00
82af928c41 [Attention][Spec Decode] FlashMLA spec decode support (#26541)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-14 19:38:20 +00:00
87efc681db llama4_vision_rope: add HIP override to accept (q, k) and avoid (positions, q, k) mismatch (#26790)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-14 11:54:12 -07:00
c3a722fcb2 [CI Failure] Fix tests with missing TinyLlama-1.1B-Chat-v1.0-FP8-e2e (#26816)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 18:38:59 +00:00
aba48f7db1 [Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818) 2025-10-14 11:20:39 -07:00
04b5f9802d [CI] Raise VLLM_MAX_SIZE_MB to 500 due to failing Build wheel - CUDA 12.9 (#26722)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 10:52:05 -07:00
efc8f7d814 Update coveragerc and add codecov.yml for path fixes (#26435)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
2025-10-14 09:45:06 -07:00
6d87a2838c [Config] Remove Unused Environment Variable VLLM_DISABLE_PAD_FOR_CUDAGRAPH (#26743)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 11:47:49 -04:00
e6cdbd6792 Revert "[issues template] Encourage the author implement their own ideas" (#26814) 2025-10-14 08:37:34 -07:00
df850c4912 [Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 08:31:43 -07:00
720394de43 [KVConnector][Metrics] Aggregate scheduler-side KVConnectorStats (#26046)
Signed-off-by: Qier Li <kevin44036@gmail.com>
2025-10-14 14:38:07 +00:00
88a49745af [issues template] Encourage the author implement their own ideas (#26671)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-14 22:32:36 +08:00
ca683a2a72 use combo kernel to fuse qk-norm and qk-rope (#26682)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-14 09:40:59 -04:00
e9f1b8c9e9 Adjusted the model order of the model registration file (#26798)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-10-14 13:26:11 +00:00
ea97940d6c [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttention (#24864)
Signed-off-by: yuanyongjie.yyj <yuanyongjie.yyj@antgroup.com>
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
Signed-off-by: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
2025-10-14 13:07:50 +00:00
fdd32750f0 [CI/Build] Cleanup LoRA test (#26752)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-14 12:06:35 +00:00
c715ba3735 [Feature] Change vllm.py with pydantic validation (#26726)
Signed-off-by: Vladislav <vladislav.bronzov@gmail.com>
Signed-off-by: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 12:00:54 +00:00
9c4cb68339 [Chore] Remove SupportsV0Only interface and update supported models docs (#26783)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 04:55:10 -07:00
780eb03d9b [CI] Fix test_tool_id_kimi_k2 (#26787)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 10:27:07 +00:00
ef9676a1f1 [Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:21:53 -07:00
70b1b330e1 Don't allow typos to fix by default (#26785)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 03:05:15 -07:00
d1d063a588 [Chore] Use max_transformers_version for Qwen-VL test (#26792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:03:46 -07:00
7e6edb1469 [NIXL][HeteroTP] Enable KV transfer from HND prefill to NHD decode (#26556)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-14 09:46:05 +00:00
74704d4553 [Model] Use merge_by_field_config for MM models (O-P) (#26776)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:42:45 +00:00
d2f816d6ff [Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:36:21 +00:00
577d498212 [Plugin] Make plugin group clear (#26757)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-14 07:49:59 +00:00
fd85c9f426 [Bugfix][FE]: Always include usage with --enable-force-include-usage (#20983)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Antoine Auger <antoineauger@users.noreply.github.com>
Co-authored-by: Antoine Auger <antoineauger@users.noreply.github.com>
2025-10-14 09:17:39 +02:00
d32c611f45 [CI/Build] Use 127.0.0.1 instead of localhost in utils (#26750)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-14 07:04:00 +00:00
01ad27faff [Model][Bugfix]fix ernie45 load failed due to ernie45 eplb code (#26684)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-14 06:55:23 +00:00
481545b397 scheduler.py: Update the name of the default scheduler. (#26758)
Signed-off-by: Ryan Li <ryanli@ryanli.org>
2025-10-14 06:52:21 +00:00
d3cc8427c0 [ci] Adding the test-amd.yaml for test definitions for the AMD backend. (alternative PR) (#26718)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-13 23:10:23 -07:00
4821ac1b4d [CI] [ROCm] Automate CC list for ROCm related issue (#26753)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-14 13:57:26 +08:00
4497c8f821 Fix lora tests failure in TPU CI due to the removal of LoRA bias (#26723)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-10-14 13:04:23 +08:00
2e36cdbe2b [Docs] Add a start tag to build.inc.md (#26747)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-13 21:51:55 -07:00
fe3edb4cf0 Add support for the /rerank endpoint in vllm bench serve (#26602)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-10-14 04:25:43 +00:00
29350922c6 [Feature][Quantization] auto_round format add support for regex (#24024)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 03:03:16 +00:00
8ae169286f [torch.compile] Unwrap fused_marlin_moe custom op (#26739)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-14 02:22:16 +00:00
8a0af6a561 [build][torch.compile] upgrade depyf version (#26702)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-14 10:12:09 +08:00
cfded80793 [Easy] Fix env type check errors from VLLM_DEBUG_LOG_API_SERVER_RESPONSE (#26742)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 01:46:44 +00:00
b59dd19b55 [compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-13 18:15:34 -07:00
3e051bda82 [UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-13 18:12:52 -07:00
8317f72354 [Misc][DP] support customized aggregated logger for dp (#24354)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-10-13 17:45:59 -07:00
d8bebb008a Add tests for chunked prefill and prefix cache with causal pooling models (#26526)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Ayush Singh <ayush1009208@gmail.com>
2025-10-14 07:45:04 +08:00
35bc22f23c [ResponseAPI] Further polish message serialization and unit tests (#26728)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 23:31:35 +00:00
fa96fb9c70 Pruning kernel Core Tests (#26727)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
2025-10-13 23:08:18 +00:00
e3fdb627d9 [FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-13 22:47:16 +00:00
7200a21cd1 [Bug] Fix Assertion error DeepEP/csrc/kernels/intranode.cu:928: 'false and Unsupported type' (#26532)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-13 18:26:37 -04:00
577c72a227 [CI Perf]Prune Tests in kernel/mamba (#26538)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-13 18:22:31 -04:00
314285d4f2 [CI] Fix mypy for vllm/distributed (#26593)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 16:02:24 -04:00
d2a7938582 [Frontend][1/N] Improve all pooling task | Support FP16 Embedding Base64 (Still uses fp32 by default). (#26414)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-13 19:06:43 +00:00
89342ce4c0 [Quantization] [Performance] Enable Marlin GEMM kernels for the calibration-free RTN-based quantization (#26051)
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Signed-off-by: Alex Kogan <82225080+sakogan@users.noreply.github.com>
2025-10-13 18:52:54 +00:00
f89f599395 [CI][Release][Arm64]: Build arm64 release for gpu arch 8.9 (#26698) 2025-10-13 18:42:12 +00:00
e251e457c5 [Log] Optimize Startup Log (#26601)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 02:06:57 +08:00
afc47e4de7 [Model] Use merge_by_field_config for MM models (M-N) (#26710)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 01:27:01 +08:00
e3b90c1ba2 [Bugfix][Speculative Decoding] Extend Eagle quantization config fix to llama_eagle.py (#26590)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-13 17:17:13 +00:00
134f70b3ed [Bugfix][Rocm] fix qr error when different inp shape (#25892)
Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-13 10:04:21 -07:00
a1b2d658ee [CI/Build] upgrade compressed-tensors to 0.12.2 to address LGPLv3 (#26501)
Signed-off-by: Sangyeon Cho <josang1204@gmail.com>
2025-10-13 12:58:33 -04:00
5c7fe25491 [Misc] Separate prompt logging to debug (#26713)
Signed-off-by: Aleksei Tsvetkov <aitsvet@ya.ru>
2025-10-13 09:04:18 -07:00
53c9a7cee2 [P/D] [NixlConnector] kv load recovery integration (#26171)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-10-13 08:48:04 -07:00
0d21b9b51e [UX] Speedup DeepGEMM warmup with heuristics (#25619)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-13 07:59:27 -07:00
10214b6935 [FEATURE]: Use pydantic validation in multimodal.py config (#26629)
Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 07:56:59 -07:00
4a61950f4d [Hardware][CPU] Disable torch.compile for RISC-V to prevent APIError (#26693)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
2025-10-13 07:56:01 -07:00
3263799056 [unrevert] Add batch invariant kernel override for FlashInfer backend [2/n] (#26373)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Bram Wasti <bwasti@fb.com>
2025-10-13 10:24:53 -04:00
8e67b2557a [Bugfix] Fix out of bound index issue for Jina-embedding-v3 RoPE with cuda graph (#26687)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-13 03:21:48 -07:00
4073c82c4e [ResponseAPI] Simplify input/output message serialization (#26620)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 09:59:15 +00:00
767c3ab869 [Model][0/N] Improve all pooling task | clean up (#25817)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-13 16:44:50 +08:00
4f207c7174 Ignore large reformatting PRs in git blame (#26690)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 01:20:47 -07:00
782505ed8e [Model] Add reasoning_parser and tool_parser for Ernie45 thinking (#25027)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-13 15:55:20 +08:00
98f30b8cba [Model] Fix Skywork R1V mlp (#26673)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-12 22:42:17 -07:00
3cd36660f7 docs: wrong command in structured_outputs README (#26677)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-10-12 20:59:01 -07:00
46ad73955a [FIX] Throwing an exception when the model does not support pool tasks (#25840) (#25855)
Signed-off-by: zxw <1020938856@qq.com>
Co-authored-by: wang.yuqi <noooop@126.com>
2025-10-12 20:56:21 -07:00
41f3884438 [Bugfix][Core]Fix block table out-of-range issue in priority scheduling (#26661)
Signed-off-by: quanliu <18646313696@163.com>
2025-10-13 01:25:42 +00:00
60e419c1ee [Misc] cache result of disable_inplace (#26666)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-13 00:17:50 +00:00
602 changed files with 19486 additions and 8249 deletions

View File

@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 def print_top_10_largest_files(zip_file):
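For readers unfamiliar with this gate, the script simply measures the built wheel against the limit above and fails the build if it is exceeded. A minimal, illustrative sketch in the same spirit (not the exact vLLM script; the names and structure here are assumptions):

```python
# Hedged sketch of a wheel-size gate driven by VLLM_MAX_SIZE_MB (illustrative;
# the real check lives in the vLLM repo and is more thorough).
import os
import sys
import zipfile

limit_mb = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))  # default raised from 450

def wheel_size_mb(path: str) -> float:
    """Return the on-disk size of the wheel in MiB."""
    return os.path.getsize(path) / (1 << 20)

def print_top_10_largest_files(path: str) -> None:
    """List the biggest members of the wheel to help diagnose size regressions."""
    with zipfile.ZipFile(path) as zf:
        infos = sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)
        for info in infos[:10]:
            print(f"{info.file_size / (1 << 20):8.2f} MiB  {info.filename}")

if __name__ == "__main__":
    wheel = sys.argv[1]
    size = wheel_size_mb(wheel)
    if size > limit_mb:
        print(f"{wheel} is {size:.1f} MiB, over the {limit_mb} MiB limit")
        print_top_10_largest_files(wheel)
        sys.exit(1)
    print(f"{wheel} is {size:.1f} MiB, within the {limit_mb} MiB limit")
```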

View File

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
limit: 100
num_fewshot: 0

View File

@@ -0,0 +1,11 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

View File

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"

View File

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0
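These YAML files are consumed by the lm-eval-harness correctness tests. As a rough sketch of how a config like the one above could drive an evaluation (assumed wiring; the real logic lives in .buildkite/lm-eval-harness/test_lm_eval_correctness.py, whose diff appears further below):

```python
# Hedged sketch: how the Qwen2.5-VL ChartQA config above might map onto
# lm_eval.simple_evaluate. Field names mirror the YAML; the config path and
# tp_size value are assumptions for illustration.
import lm_eval
import yaml

with open("configs/Qwen2.5-VL-7B-Instruct.yaml") as f:
    eval_config = yaml.safe_load(f)

tp_size = 1
backend = eval_config.get("backend", "vllm")  # "vllm-vlm" for multimodal configs
model_args = (
    f"pretrained={eval_config['model_name']},"
    f"tensor_parallel_size={tp_size},"
    f"max_model_len=4096"
)

results = lm_eval.simple_evaluate(
    model=backend,
    model_args=model_args,
    tasks=[task["name"] for task in eval_config["tasks"]],  # e.g. ["chartqa"]
    num_fewshot=eval_config["num_fewshot"],
    limit=eval_config["limit"],
    apply_chat_template=backend == "vllm-vlm",
    batch_size="auto",
)
```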

View File

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

View File

@@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml

View File

@@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT

View File

View File

@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto

View File

@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
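After launch_lm_eval returns, the harness checks each measured metric against the baseline value recorded in the YAML configs, within the RTOL = 0.08 tolerance named in this hunk. A hedged sketch of that comparison (not the verbatim test body):

```python
# Hedged sketch of the accuracy assertion these configs drive: each measured
# metric must be within RTOL of the baseline value stored in the YAML.
import numpy as np

RTOL = 0.08

def check_results(eval_config: dict, results: dict) -> None:
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            print(f"{task['name']} | {metric['name']}: "
                  f"ground_truth={ground_truth}, measured={measured}")
            assert np.isclose(ground_truth, measured, rtol=RTOL)
```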

View File

@@ -8,7 +8,7 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
  # Add job to create multi-arch manifest

.buildkite/test-amd.yaml (new file, 1267 lines)

File diff suppressed because it is too large.

View File

@@ -527,8 +527,9 @@ steps:
     # since torchao nightly is only compatible with torch nightly currently
     # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
     # we can only upgrade after this is resolved
-    - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+    # TODO(jerryzh168): resolve the above comment
+    - uv pip install --system torchao==0.13.0
+    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -733,6 +734,16 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true

View File

@@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
     */tests/*
     */test_*
@@ -12,6 +17,16 @@ omit =
     */benchmarks/*
     */docs/*

+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
 [report]
 exclude_lines =
     pragma: no cover
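The new [paths] block only pays off at merge time, when coverage data produced in different install locations is combined into one report. A small sketch of that step using the standard coverage.py API (the invocation and file layout here are assumptions):

```python
# Hedged sketch: merging per-container .coverage.* files. The [paths] section
# above lets coverage.py treat */site-packages/vllm, */dist-packages/vllm and
# the workspace checkouts as the same canonical "vllm" source tree.
from coverage import Coverage

cov = Coverage(config_file=".coveragerc")
cov.combine()            # merge all .coverage.* data files found in the cwd
cov.save()
total = cov.report()     # prints the combined report and returns total percent
print(f"combined line coverage: {total:.1f}%")
```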

.git-blame-ignore-revs (new file, 4 lines)
View File

@@ -0,0 +1,4 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

.github/CODEOWNERS (vendored, 5 changed lines)
View File

@@ -5,9 +5,7 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
@@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -60,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC

 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor

 # Docs

View File

@@ -13,6 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
+        id: label-step
         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
         with:
           script: |
@@ -42,7 +43,6 @@ jobs:
 searchIn: "body"
 },
 ],
 // Substring search - matches anywhere in text (partial matches)
 substrings: [
 {
@@ -89,14 +89,12 @@ jobs:
 term: "hip_",
 searchIn: "both"
 },
 // ROCm tools and libraries
 {
 term: "hipify",
 searchIn: "both"
 },
 ],
 // Regex patterns - for complex pattern matching
 regexPatterns: [
 {
@@ -107,13 +105,17 @@
 }
 ],
 },
+// Add more label configurations here as needed
+// example: {
+// keywords: [...],
+// substrings: [...],
+// regexPatterns: [...]
+// },
 };
 // Helper function to create regex based on search type
 function createSearchRegex(term, type) {
 // Escape special regex characters in the term
 const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
 switch (type) {
 case 'keyword':
 // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@
 throw new Error(`Unknown search type: ${type}`);
 }
 }
 // Helper function to find matching terms in text with line information
 function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
 const matches = [];
 const lines = text.split('\n');
 for (const termConfig of searchTerms) {
 let regex;
 let term, searchIn, pattern, description, flags;
 // Handle different input formats (string or object)
 if (typeof termConfig === 'string') {
 term = termConfig;
@@ -146,21 +145,17 @@
 description = termConfig.description;
 flags = termConfig.flags;
 }
 // Skip if this term shouldn't be searched in the current location
 if (searchIn !== 'both' && searchIn !== searchLocation) {
 continue;
 }
 // Create appropriate regex
 if (searchType === 'regex') {
 regex = new RegExp(pattern, flags || "gi");
 } else {
 regex = createSearchRegex(term, searchType);
 }
 const termMatches = [];
 // Check each line for matches
 lines.forEach((line, lineIndex) => {
 const lineMatches = line.match(regex);
@@ -175,15 +170,14 @@
 originalTerm: term || pattern,
 description: description,
 // Show context around the match in the line
 context: line.length > 100 ?
 line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
 line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
 : line.trim()
 });
 });
 }
 });
 if (termMatches.length > 0) {
 matches.push({
 term: term || (description || pattern),
@@ -196,64 +190,48 @@
 });
 }
 }
 return matches;
 }
 // Helper function to check if label should be added
 async function processLabel(labelName, config) {
 const body = context.payload.issue.body || "";
 const title = context.payload.issue.title || "";
 core.notice(`Processing label: ${labelName}`);
 core.notice(`Issue Title: "${title}"`);
 core.notice(`Issue Body length: ${body.length} characters`);
 let shouldAddLabel = false;
 let allMatches = [];
 let reason = '';
 const keywords = config.keywords || [];
 const substrings = config.substrings || [];
 const regexPatterns = config.regexPatterns || [];
 core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
 // Search in title
 if (title.trim()) {
 core.notice(`Searching in title: "${title}"`);
 const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
 const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
 const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
 allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
 }
 // Search in body
 if (body.trim()) {
 core.notice(`Searching in body (${body.length} characters)`);
 const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
 const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
 const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
 allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
 }
 if (allMatches.length > 0) {
 core.notice(`Found ${allMatches.length} matching term(s):`);
 for (const termMatch of allMatches) {
 const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
 const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
 if (termMatch.searchType === 'regex') {
 core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
 } else {
 core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
 }
 // Show details for each match
 termMatch.matches.forEach((match, index) => {
 core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@
 }
 });
 }
 shouldAddLabel = true;
 const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
 const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@
 const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
 const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
 const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
 reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
 }
 core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
 core.notice(`Reason: ${reason || 'No matching terms found'}`);
 if (shouldAddLabel) {
 const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) { if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`); core.notice(`Label "${labelName}" already present.`);
return false; return false;
} }
core.notice(`No matching terms found for label "${labelName}".`); core.notice(`No matching terms found for label "${labelName}".`);
return false; return false;
} }
// Process all configured labels // Process all configured labels
const processLabels = Object.entries(labelConfig) const labelsAddedResults = await Promise.all(
.map(([labelName, config]) => processLabel(labelName, config)); Object.entries(labelConfig).map(([labelName, config]) =>
const labelsAdded = await Promise.all(processLabels); processLabel(labelName, config).then(added => ({ labelName, added }))
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); )
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); );
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}

View File

@ -16,6 +16,7 @@ repos:
  rev: v1.38.1
  hooks:
  - id: typos
+   args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v21.1.2
  hooks:

View File

@ -31,6 +31,7 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
+from contextlib import nullcontext
from dataclasses import dataclass

import datasets
@ -501,15 +502,9 @@ async def benchmark(
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-   # This can be used once the minimum Python version is 3.10 or higher,
-   # and it will simplify the code in limited_request_func.
-   # semaphore = (asyncio.Semaphore(max_concurrency)
-   #              if max_concurrency else contextlib.nullcontext())
-   semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+   semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited_request_func(request_func_input, pbar):
-       if semaphore is None:
-           return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
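Worth noting: this simplification relies on `contextlib.nullcontext` being usable as an async context manager, which is the case from Python 3.10 onward. A minimal, self-contained sketch of the same pattern (illustrative names only, not taken from the benchmark script):

```python
import asyncio
from contextlib import nullcontext

async def run_all(coros, max_concurrency=None):
    # One shared limiter: a real semaphore when a limit is set,
    # otherwise a no-op async context manager (Python >= 3.10).
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(coro):
        async with limiter:
            return await coro

    return await asyncio.gather(*(limited(c) for c in coros))

async def main():
    async def job(i):
        await asyncio.sleep(0.01)
        return i

    print(await run_all([job(i) for i in range(5)], max_concurrency=2))

asyncio.run(main())
```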

View File

@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-   dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+   dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)

View File

@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
    topk = config.num_experts_per_tok
    hidden_size = config.hidden_size
-   dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+   dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute

View File

@ -22,10 +22,10 @@ else()
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
- FetchContent_Populate(qutlass)
- set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()
+FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()

codecov.yml (new file, 12 lines)
View File

@ -0,0 +1,12 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -125,32 +125,37 @@ public:
  }

  static void set_split_kv (KernelArguments& args) {
-   // printf("set_split_kv start");
    if (args.split_kv >= 1) return;
    auto [H, K, D, B] = args.problem_shape;
-   // std::cout << H << " " << K << " " << D << " " << B << "\n";
    int sm_count = args.hw_info.sm_count;
-   // printf(" sm_count = %d\n", sm_count);
-   int max_splits = ceil_div(K, 128);
-   max_splits = min(16, max_splits);
-   // TODO: This avoids a hang when the batch size larger than 1 and
-   // there is more than 1 kv_splits.
-   // Discuss with NVIDIA how this can be fixed.
-   if (B > 1) {
-     max_splits = min(1, max_splits);
-   }
-   // printf(" max_splits = %d\n", max_splits);
+   float seq_length_k = static_cast<float>(K) / 1024.0f;
+   int max_splits = 1;
+   if (B <= 4 && seq_length_k >= 16) {
+     max_splits = 16;
+   }
+   else if (B <= 8 && seq_length_k >= 4) {
+     max_splits = 8;
+   }
+   else if ((B <= 16 && seq_length_k >= 8) ||
+            (B == 48 && seq_length_k >= 32)) {
+     max_splits = 4;
+   }
+   else if ((B <= 32 && seq_length_k >= 16) ||
+            (B == 96 && seq_length_k >= 16)) {
+     max_splits = 2;
+   }
+   else {
+     max_splits = 1;
+   }
+   // Wave-aware scheduling: ensure integer number of waves in K dimension
    int sms_per_batch = max(1, sm_count / B);
-   // printf(" sms_per_batch = %d\n", sms_per_batch);
    int split_heur = min(max_splits, sms_per_batch);
    int waves = ceil_div(B * split_heur, sm_count);
    int k_waves = ceil_div(max_splits, split_heur);
    int split_wave_aware = ceil_div(max_splits, k_waves);
    args.split_kv = split_wave_aware;
-   // printf(" args.split_kv = %d\n", args.split_kv);
  }
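As a quick summary of the new heuristic, the same selection logic can be paraphrased in Python (illustrative only, not code that exists in the repository):

```python
def choose_split_kv(batch_size: int, seq_len_k: int, sm_count: int) -> int:
    """Pick a KV-split factor: larger splits for small batches with long KV."""
    k = seq_len_k / 1024.0
    if batch_size <= 4 and k >= 16:
        max_splits = 16
    elif batch_size <= 8 and k >= 4:
        max_splits = 8
    elif (batch_size <= 16 and k >= 8) or (batch_size == 48 and k >= 32):
        max_splits = 4
    elif (batch_size <= 32 and k >= 16) or (batch_size == 96 and k >= 16):
        max_splits = 2
    else:
        max_splits = 1
    # Cap by the SMs available per batch element, then round so the K
    # dimension is covered by an integer number of waves.
    sms_per_batch = max(1, sm_count // batch_size)
    split = min(max_splits, sms_per_batch)
    k_waves = -(-max_splits // split)      # ceil_div(max_splits, split)
    return -(-max_splits // k_waves)       # ceil_div(max_splits, k_waves)
```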
  /// Determines whether the GEMM can execute the given problem.

View File

@ -5,11 +5,11 @@
namespace vllm {

-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
+// vllm_is_batch_invariant(); returns true
+// if env VLLM_BATCH_INVARIANT=1
+inline bool vllm_is_batch_invariant() {
  static bool cached = []() {
-   std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
+   std::string env_key = "VLLM_BATCH_INVARIANT";
    const char* val = std::getenv(env_key.c_str());
    return (val && std::atoi(val) != 0) ? 1 : 0;
  }();
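The renamed helper still caches the environment lookup once per process. For illustration only (this is not vLLM code), the same check written in Python with a cached lookup would be:

```python
import os
from functools import lru_cache

@lru_cache(maxsize=1)
def is_batch_invariant() -> bool:
    # Roughly mirrors the C++ helper above: read VLLM_BATCH_INVARIANT once,
    # treat any non-zero integer value as "enabled", and cache the result.
    value = os.getenv("VLLM_BATCH_INVARIANT", "0")
    try:
        return int(value) != 0
    except ValueError:
        return False
```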

View File

@ -2,6 +2,7 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) { const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { constexpr int VEC_SIZE = 8;
const float x = (float)input[blockIdx.x * input_stride + idx]; auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
} };
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;
@ -414,7 +426,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0; wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width = bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0; hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width && if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) { !batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -462,7 +474,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr()); auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr()); auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0; bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) { if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8); LAUNCH_FUSED_POLY_NORM(8);
} else { } else {
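The rewritten reduction splits each row's read into a wide vectorized body (`vec_op`, 8 elements at a time) and a scalar tail (`scalar_op`) driven by the alignment-aware helper. A plain-Python sketch of that split, purely to illustrate the idea (not the CUDA helper itself):

```python
def sum_of_squares(values, vec_size=8):
    """Accumulate x*x with a vectorized-style main loop plus a scalar tail."""
    acc = 0.0
    main = len(values) - (len(values) % vec_size)
    # "vec_op": process whole chunks of vec_size elements.
    for start in range(0, main, vec_size):
        for x in values[start:start + vec_size]:
            acc += x * x
    # "scalar_op": process the leftover elements one by one.
    for x in values[main:]:
        acc += x * x
    return acc
```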

View File

@ -10,6 +10,7 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { const scalar_t* input_row = input + blockIdx.x * input_stride;
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
} };
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;
@ -241,7 +254,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr()); auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned = bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant(); bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 && if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) { !batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);

View File

@ -8,12 +8,77 @@
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../dispatch_utils.h" #include "../dispatch_utils.h"
#include "core/math.hpp"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y)) #define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm { namespace vllm {
namespace moe { namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
static constexpr int32_t num_threads = 1024;
static constexpr int32_t num_blocks = 1;
__global__ void batched_moe_align_block_size_kernel(
int32_t const num_batches, int32_t const max_tokens_per_batch,
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
int32_t* __restrict__ num_tokens_post_pad) {
// TODO(varun): This is a naive implementation. Could be optimized.
size_t const batch_id = threadIdx.x;
size_t const stride = blockDim.x * gridDim.x;
int32_t const num_blocks_per_batch =
CEILDIV(max_tokens_per_batch, block_size);
int32_t const sorted_ids_size =
num_blocks_per_batch * num_batches * block_size;
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
  // Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
  // Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}
int32_t b_num_tokens = 0;
if (batch_id < num_batches) {
b_num_tokens = batch_num_tokens[batch_id];
}
int32_t const ceil_b_num_tokens =
CEILDIV(b_num_tokens, block_size) * block_size;
// Compute prefix sum over token counts per expert
using BlockScan = cub::BlockScan<int32_t, 1024>;
__shared__ typename BlockScan::TempStorage temp_storage;
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
__syncthreads();
bool const is_last_batch = batch_id == (num_batches - 1);
if (is_last_batch) {
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
}
if (batch_id < num_batches) {
int32_t const batch_offset = batch_id * max_tokens_per_batch;
for (size_t i = 0; i < b_num_tokens; ++i) {
sorted_ids[cumsum_val + i] = batch_offset + i;
}
int32_t const block_start = cumsum_val / block_size;
int32_t const num_blocks = ceil_b_num_tokens / block_size;
for (size_t i = 0; i < num_blocks; ++i) {
block_ids[block_start + i] = batch_id;
}
}
}
} // namespace batched_moe_align_block_size
template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
    const scalar_t* __restrict__ topk_ids,
@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  });
}
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& batch_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor batch_ids,
torch::Tensor num_tokens_post_pad) {
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t const B = batch_num_tokens.size(0);
int32_t const num_blocks_per_batch =
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
int32_t const num_blocks = num_blocks_per_batch * B;
int64_t const sorted_ids_size = num_blocks * block_size;
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
TORCH_CHECK(B <= batched_kernel::num_threads);
batched_kernel::batched_moe_align_block_size_kernel<<<
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>());
}
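For readers who want to sanity-check the new op's outputs, here is an illustrative pure-Python reference of the same bookkeeping (not code from this PR; it only mirrors the kernel's logic for small inputs):

```python
def batched_moe_align_block_size_ref(max_tokens_per_batch, block_size, batch_num_tokens):
    """Pad each batch's token count up to a multiple of block_size and
    record which batch owns each resulting block."""
    ceil_div = lambda a, b: -(-a // b)
    num_batches = len(batch_num_tokens)
    blocks_per_batch = ceil_div(max_tokens_per_batch, block_size)
    sorted_ids_size = blocks_per_batch * num_batches * block_size
    sentinel = num_batches * max_tokens_per_batch  # marks padding slots

    sorted_ids = [sentinel] * sorted_ids_size
    block_ids = [-1] * (sorted_ids_size // block_size)

    cumsum = 0  # exclusive prefix sum of the padded per-batch token counts
    for batch_id, n in enumerate(batch_num_tokens):
        padded = ceil_div(n, block_size) * block_size
        for i in range(n):
            sorted_ids[cumsum + i] = batch_id * max_tokens_per_batch + i
        for j in range(padded // block_size):
            block_ids[cumsum // block_size + j] = batch_id
        cumsum += padded

    num_tokens_post_pad = cumsum
    return sorted_ids, block_ids, num_tokens_post_pad

# e.g. two batches with 3 and 5 tokens, block_size=4, max 8 tokens per batch:
# num_tokens_post_pad == 12; the first block belongs to batch 0, the next two to batch 1.
```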
void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
             torch::Tensor& output)  // [num_tokens, hidden_size]
{

View File

@ -12,6 +12,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -21,7 +21,6 @@
#include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h"
#include "../cub_helpers.h"
-#include "../core/batch_invariant.hpp"

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
  using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
  static constexpr int VPT = Constants::VPT;
  static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
- const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
- const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+ const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

  dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);

View File

@ -22,6 +22,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"); " Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);
#ifndef USE_ROCM
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -22,13 +22,14 @@ template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
                            int rank, uint8_t** dbuffer_list,
-                           uint32_t data_offset, uint32_t flag_color) {
+                           uint32_t data_offset, uint32_t flag_color,
+                           int64_t data_size_per_phase) {
  int block = blockIdx.x;
  int grid = gridDim.x;
  while (block < num_blocks) {
    AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
-                        flag_color);
+                        flag_color, data_size_per_phase);
    block += grid;
    flag_color++;
  }
@ -41,21 +42,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                       num_blocks, rank, dbuffer_list, data_offset, \
-                      flag_color); \
+                      flag_color, this->kMaxProblemSize); \
  } else if (world_size == 4) { \
    using LineCodec = __codec<T, 4>; \
    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                       num_blocks, rank, dbuffer_list, data_offset, \
-                      flag_color); \
+                      flag_color, this->kMaxProblemSize); \
  } else if (world_size == 8) { \
    using LineCodec = __codec<T, 8>; \
    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
                       num_blocks, rank, dbuffer_list, data_offset, \
-                      flag_color); \
+                      flag_color, this->kMaxProblemSize); \
  }

enum QuickReduceQuantLevel {

View File

@ -553,13 +553,12 @@ struct AllReduceTwoshot {
                              int const rank,  // rank index
                              uint8_t** __restrict__ buffer_list,  // communication buffers
                              uint32_t const data_offset,  // offset to start of the data buffer
-                             uint32_t flag_color) {
+                             uint32_t flag_color, int64_t data_size_per_phase) {
    // Topology
    int thread = threadIdx.x + threadIdx.y * kWavefront;
    uint8_t* rank_buffer = buffer_list[rank];
    Codec codec(thread, rank);
    int block_id = blockIdx.x;
-   int grid_size = gridDim.x;
    // --------------------------------------------------------
    // Read input into registers
    int32x4_t tA[kAtoms];
@ -588,12 +587,10 @@ struct AllReduceTwoshot {
    // rank responsible for this segment.
    uint32_t comm_data0_offset =
        data_offset + block_id * Codec::kTransmittedTileSize;
-   uint32_t comm_data1_offset =
-       grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
+   uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;

    uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
-   uint32_t comm_flags1_offset =
-       grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
+   uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;

    for (int r = 0; r < kWorldSize; r++) {
      int32x4_t* send_buffer =

View File

@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=450
+ARG VLLM_MAX_SIZE_MB=500
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -359,8 +359,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
-   uv pip install --system flashinfer-cubin==0.4.0 \
-   && uv pip install --system flashinfer-jit-cache==0.4.0 \
+   uv pip install --system flashinfer-cubin==0.4.1 \
+   && uv pip install --system flashinfer-jit-cache==0.4.1 \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
    && flashinfer show-config

View File

@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.0
+# release version: v0.4.1
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    echo "git clone flashinfer..." \
    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
-   && git checkout v0.4.0 \
+   && git checkout v0.4.1\
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
    && rm -rf build \

View File

@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \
    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
    apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR

View File

@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
```python
from vllm import LLM

llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
```

!!! warning
@ -24,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
!!! note
    With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).

    You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.

## Quantization
@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
```python
from vllm import LLM

llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
```

## Reduce CUDA Graphs
@ -61,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python
from vllm import LLM
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm.config import CompilationConfig, CompilationMode

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
-       level=CompilationLevel.PIECEWISE,
+       mode=CompilationMode.VLLM_COMPILE,
        # By default, it goes up to max_num_seqs
        cudagraph_capture_sizes=[1, 2, 4, 8, 16],
    ),
@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python
from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
```

## Adjust cache size
@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM

# Accept up to 3 images and 1 video per prompt
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 3, "video": 1},
)
```

You can go a step further and disable unused modalities completely by setting its limit to zero.
@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM

# Accept any number of images but no videos
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"video": 0},
)
```

You can even run a multi-modal model for text-only inference:
@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM

# Don't accept images. Just text.
llm = LLM(
    model="google/gemma-3-27b-it",
    limit_mm_per_prompt={"image": 0},
)
```

### Configurable options
@ -173,14 +175,14 @@ Here are some examples:
from vllm import LLM

# Available for Qwen2-VL series models
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_kwargs={"max_pixels": 768 * 768},  # Default is 1280 * 28 * 28
)

# Available for InternVL series models
llm = LLM(
    model="OpenGVLab/InternVL2-2B",
    mm_processor_kwargs={"max_dynamic_patch": 4},  # Default is 12
)
```

View File

@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM(
    model="meta-llama/Llama-3.3-70B-Instruct,
    tensor_parallel_size=4,
    pipeline_parallel_size=2,
)
```
@ -174,14 +174,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
Known supported models (with corresponding benchmarks):

- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)

## Input Processing
@ -257,18 +257,24 @@ Examples:
```python
# Use a larger cache
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=8,
)

# Use a shared-memory based IPC cache
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    tensor_parallel_size=2,
    mm_processor_cache_type="shm",
    mm_processor_cache_gb=8,
)

# Disable the cache
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=0,
)
```

### Cache Placement

View File

@ -96,7 +96,7 @@ Although its common to do this with GPUs, don't try to fragment 2 or 8 differ
### Tune your workloads

Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.

### Future Topics We'll Cover

View File

@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
## License

See [LICENSE](../../LICENSE).

## Developing
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.

!!! tip
    vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.

    Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
### Documentation

MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).

Get started with:
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.

!!! important
    If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).

## Pull Requests & Code Reviews
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
### DCO and Signed-off-by

When contributing changes to this project, you must agree to the [DCO](../../DCO).
Commits must include a `Signed-off-by:` header which certifies agreement with
the terms of the DCO.

View File

@ -35,6 +35,7 @@ th {
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| RandomForReranking | ✅ | ✅ | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
@ -821,7 +822,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
- CLIP: `--backend openai-embeddings-clip`
- VLM2Vec: `--backend openai-embeddings-vlm2vec`

For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.

You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
@ -878,6 +879,51 @@ vllm bench serve \
</details>
#### Reranker Benchmark
Benchmark the performance of rerank requests in vLLM.
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Unlike generative models which use Completions API or Chat Completions API,
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
For reranking, the only supported dataset is `--dataset-name random-rerank`
Start the server:
```bash
vllm serve BAAI/bge-reranker-v2-m3
```
Run the benchmark:
```bash
vllm bench serve \
--model BAAI/bge-reranker-v2-m3 \
--backend vllm-rerank \
--endpoint /v1/rerank \
--dataset-name random-rerank \
--tokenizer BAAI/bge-reranker-v2-m3 \
--random-input-len 512 \
--num-prompts 10 \
--random-batch-size 5
```
For reranker models, this will create `num_prompts / random_batch_size` requests with
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
In the example above, this results in 2 rerank requests with 5 "documents" each where
each document has close to 512 tokens.
Please note that `/v1/rerank` is also supported by embedding models, so if you're running
with an embedding model, also set `--no_reranker`. Because in this case the query is
treated as an individual prompt by the server, we send `random_batch_size - 1` documents
to account for the extra prompt, which is the query. The token accounting used to report
throughput numbers correctly is adjusted accordingly.
</details>
[](){ #performance-benchmarks }

## Performance Benchmarks
@ -916,7 +962,7 @@ For more results visualization, check the [visualizing the results](https://gith
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).

### Continuous Benchmarking
@ -950,4 +996,4 @@ These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lm
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).

More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).

View File

@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
Strip timestamps and colorization:

[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)

```bash
./ci-clean-log.sh ci.log
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
CI test failures may be flaky. Use a bash loop to run repeatedly:

[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)

```bash
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]

View File

@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
update along with a list of potential issues and how to address them.

## Test PyTorch release candidates (RCs)
@ -85,7 +85,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.

While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
@ -138,5 +138,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.

View File

@ -1,6 +1,6 @@
# Dockerfile

We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](../../deployment/docker.md).

Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

View File

@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
## 1. Bring your model code

First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.

!!! warning
@ -73,8 +73,8 @@ def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor:
    ...
```
@ -83,7 +83,7 @@ def forward(
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.

For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.

## 3. (Optional) Implement tensor parallelism and quantization support
@ -130,22 +130,22 @@ We consider 3 different scenarios:
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.

For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future. V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized. The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together). For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol). These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively. For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models. Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention). We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype` and `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype` and `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it. Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
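To make the above concrete, here is a minimal, hedged sketch of a custom mamba-like layer skeleton. The import path for `MambaBase`, the exact method signatures, and the backend class are assumptions for illustration only; check the current `MambaBase` definition and the existing layers referenced above for the authoritative interface.

```python
# Illustrative sketch only: import path, signatures, and shapes are assumed,
# not copied from vLLM. See MambaBase and the layers referenced above.
import torch

from vllm.model_executor.layers.mamba.abstract import MambaBase  # assumed path


class MyLinearStateMixer(MambaBase):
    """Hypothetical mamba-like layer whose state is updated in-place."""

    def __init__(self, num_heads: int, head_dim: int, state_size: int):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.state_size = state_size

    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
        # Data type(s) of the recurrent state tensor(s).
        return (torch.float32,)

    def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
        # Per-request shape(s) of the recurrent state tensor(s).
        return ((self.num_heads, self.head_dim, self.state_size),)

    @property
    def mamba_type(self) -> str:
        # Identifier used to match the layer with its attention metadata class.
        return "my_linear_state"

    def get_attn_backend(self):
        # Return the backend providing the "attention meta-data" class,
        # analogous to LinearAttentionMetadata / ShortConvAttentionMetadata.
        from my_plugin.attn_backend import MyLinearStateBackend  # hypothetical
        return MyLinearStateBackend

    # The forward pass itself must additionally be wrapped in a custom op via
    # direct_register_custom_op (see the examples referenced above) so that
    # torch.compile and CUDA graphs work as intended.
```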

View File

@ -16,7 +16,7 @@ Further update the model as follows:
... ...
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"): if modality.startswith("image"):
return "<image>" return "<image>"
@ -45,14 +45,14 @@ Further update the model as follows:
... ...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: self,
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt: For example, if the model supports any number of images but only one video per prompt:
```python ```python
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"], return ImageSize(
height=image_processor.size["height"]) width=image_processor.size["width"],
height=image_processor.size["height"],
)
``` ```
Fuyu does not expect image placeholders in the inputs to the HF processor, so Fuyu does not expect image placeholders in the inputs to the HF processor, so
@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(
height=target_height, width=target_width,
num_images=num_images, height=target_height,
overrides=image_overrides) num_images=num_images,
overrides=image_overrides,
)
} }
``` ```
@ -503,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
``` ```
!!! note !!! note
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument. pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
=== "With postprocessing: Fuyu" === "With postprocessing: Fuyu"
@ -565,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
``` ```
!!! note !!! note
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
for text-only inputs to prevent unnecessary warnings from the HF processor. for text-only inputs to prevent unnecessary warnings from the HF processor.
!!! note !!! note
@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
@ -810,9 +812,11 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, + @MULTIMODAL_REGISTRY.register_processor(
+ info=YourProcessingInfo, + YourMultiModalProcessor,
+ dummy_inputs=YourDummyInputsBuilder) + info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal): class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
``` ```
@ -824,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in
Examples: Examples:
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py> - BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py> - Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
### Handling prompt updates unrelated to multi-modal data ### Handling prompt updates unrelated to multi-modal data
@ -833,9 +837,9 @@ Examples:
Examples: Examples:
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py> - Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py> - Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py> - Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
### Custom HF processor ### Custom HF processor
@ -843,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
Examples: Examples:
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py> - DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
- InternVL: <gh-file:vllm/model_executor/models/internvl.py> - InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py> - Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)

View File

@ -11,8 +11,8 @@ This page provides detailed instructions on how to do so.
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source]. To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
This gives you the ability to modify the codebase and test your model. This gives you the ability to modify the codebase and test your model.
After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory. After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM. Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model! Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
!!! important !!! important
@ -42,7 +42,7 @@ def register():
ModelRegistry.register_model( ModelRegistry.register_model(
"YourModelForCausalLM", "YourModelForCausalLM",
"your_code:YourModelForCausalLM" "your_code:YourModelForCausalLM",
) )
``` ```
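If your model lives in an installable package, the `register()` function is usually exposed through a Python entry point so vLLM can discover it at import time. A minimal sketch, assuming the `vllm.general_plugins` entry-point group and a hypothetical module name `your_code`:

```python
# setup.py -- sketch of exposing register() as a vLLM plugin entry point.
# The group name and package/module names are assumptions; adapt to your project.
from setuptools import setup

setup(
    name="vllm-your-model-plugin",
    version="0.1.0",
    py_modules=["your_code"],
    entry_points={
        "vllm.general_plugins": [
            "register_your_model = your_code:register",
        ],
    },
)
```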

View File

@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
### Model loading ### Model loading
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>. Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
!!! important !!! important
@ -26,18 +26,18 @@ Passing these tests provides more confidence that your implementation is correct
### Model correctness ### Model correctness
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>. These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
#### Generative models #### Generative models
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>: For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
#### Pooling models #### Pooling models
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>. For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
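As an illustration of what such a check does (this is not the actual helper in `tests/models/utils.py`), a cosine-similarity comparison between a vLLM embedding and an HF embedding might look like:

```python
# Illustrative only: compare two pooled embeddings by cosine similarity.
# The 0.99 threshold is an arbitrary example value, not the one used in vLLM's tests.
import torch
import torch.nn.functional as F


def assert_embeddings_close(
    vllm_emb: list[float], hf_emb: list[float], min_sim: float = 0.99
) -> None:
    a = torch.tensor(vllm_emb, dtype=torch.float32)
    b = torch.tensor(hf_emb, dtype=torch.float32)
    sim = F.cosine_similarity(a, b, dim=0).item()
    assert sim >= min_sim, f"cosine similarity too low: {sim:.4f}"
```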
[](){ #mm-processing-tests } [](){ #mm-processing-tests }
@ -45,7 +45,7 @@ For [pooling models](../../models/pooling_models.md), we simply check the cosine
#### Common tests #### Common tests
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs: Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
- Text + multi-modal data - Text + multi-modal data
- Tokens + multi-modal data - Tokens + multi-modal data
@ -54,6 +54,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>
#### Model-specific tests #### Model-specific tests
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model. You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>. For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).

View File

@ -15,6 +15,7 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper). - Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper).
??? code "supported_languages and supports_transcription_only" ??? code "supported_languages and supports_transcription_only"
```python ```python
from typing import ClassVar, Mapping, Literal from typing import ClassVar, Mapping, Literal
import numpy as np import numpy as np
@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model: This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()" ??? code "get_speech_to_text_config()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper): If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()" ??? code "validate_language()"
```python ```python
@classmethod @classmethod
def validate_language(cls, language: str | None) -> str | None: def validate_language(cls, language: str | None) -> str | None:
if language is None: if language is None:
logger.warning( logger.warning(
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") "Defaulting to language='en'. If you wish to transcribe "
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en" language = "en"
return super().validate_language(language) return super().validate_language(language)
``` ```
@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics: Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()" ??? code "get_num_audio_tokens()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic: Relevant server logic:
??? code "_preprocess_speech_to_text()" ??? code "_preprocess_speech_to_text()"
```python ```python
# vllm/entrypoints/openai/speech_to_text.py # vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...): async def _preprocess_speech_to_text(...):
@ -238,9 +248,9 @@ No extra registration is required beyond having your model class available via t
## Examples in-tree ## Examples in-tree
- Whisper encoder-decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py> - Whisper encoder-decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py> - Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py> - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
## Test with the API ## Test with the API
@ -268,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
http://localhost:8000/v1/audio/translations http://localhost:8000/v1/audio/translations
``` ```
Or check out more examples in <gh-file:examples/online_serving>. Or check out more examples in [examples/online_serving](../../../examples/online_serving).
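Beyond raw `curl`, the endpoints can also be exercised with the OpenAI Python SDK; a small sketch follows (the served model name and audio file are placeholders):

```python
# Sketch: call the transcription endpoint of a running vLLM server with the
# OpenAI Python SDK. The served model name and the audio file are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio:
    result = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",
        file=audio,
        language="en",
    )

print(result.text)
```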
!!! note !!! note
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
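For example, a model that performs its own chunking might return a config along these lines; the import paths and the exact classmethod signature are assumptions, so check `SupportsTranscription` for the real interface:

```python
# Sketch only: import paths and the classmethod signature are assumptions.
from torch import nn

from vllm.config import SpeechToTextConfig  # assumed import path
from vllm.model_executor.models.interfaces import SupportsTranscription


class YourASRModel(nn.Module, SupportsTranscription):
    @classmethod
    def get_speech_to_text_config(cls, model_config, task_type):
        # The model chunks audio internally, so disable server-side chunking.
        return SpeechToTextConfig(min_energy_split_window_size=None)
```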

View File

@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
#### Offline Inference #### Offline Inference
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example. Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
#### OpenAI Server #### OpenAI Server

View File

@ -10,7 +10,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
```bash ```bash
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ --env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \ -p 8000:8000 \
--ipc=host \ --ipc=host \
vllm/vllm-openai:latest \ vllm/vllm-openai:latest \
@ -22,7 +22,7 @@ This image can also be used with other container engines such as [Podman](https:
```bash ```bash
podman run --device nvidia.com/gpu=all \ podman run --device nvidia.com/gpu=all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ --env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \ -p 8000:8000 \
--ipc=host \ --ipc=host \
docker.io/vllm/vllm-openai:latest \ docker.io/vllm/vllm-openai:latest \
@ -37,7 +37,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
memory to share data between processes under the hood, particularly for tensor parallel inference. memory to share data between processes under the hood, particularly for tensor parallel inference.
!!! note !!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>). Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
If you need to use those dependencies (having accepted the license terms), If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them: create a custom Dockerfile on top of the base image with an extra layer that installs them:
@ -66,7 +66,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
## Building vLLM's Docker Image from Source ## Building vLLM's Docker Image from Source
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM: You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
```bash ```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
@ -128,7 +128,7 @@ To run vLLM with the custom-built Docker image:
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \ -p 8000:8000 \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \ --env "HF_TOKEN=<secret>" \
vllm/vllm-openai <args...> vllm/vllm-openai <args...>
``` ```

View File

@ -5,7 +5,7 @@
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray. [Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>. without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm). When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

View File

@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
??? console "Command" ??? console "Command"
```python ```bash
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \ -H 'Authorization: <JWT TOKEN>' \
@ -81,7 +81,7 @@ You should get a response like:
??? console "Response" ??? console "Response"
```python ```json
{ {
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": { "result": {

View File

@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
client = OpenAI( client = OpenAI(
base_url="https://gateway.<gateway domain>", base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>" api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
) )
completion = client.chat.completions.create( completion = client.chat.completions.create(
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user", "role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.", "content": "Compose a poem that explains the concept of recursion in programming.",
} }
] ],
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)

View File

@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1", model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs = {"max_tokens": 512} generation_kwargs={"max_tokens": 512},
) )
response = generator.run( response = generator.run(

View File

@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os import os
client = OpenAI( client = OpenAI(
base_url = DEPLOYMENT_URL, base_url=DEPLOYMENT_URL,
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model = "HuggingFaceTB/SmolLM3-3B", model="HuggingFaceTB/SmolLM3-3B",
messages = [ messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "Give me a brief explanation of gravity in simple terms." "text": "Give me a brief explanation of gravity in simple terms.",
} }
] ],
} }
], ],
stream = True stream=True,
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end = "") print(message.choices[0].delta.content, end="")
``` ```
!!! note !!! note
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os import os
client = OpenAI( client = OpenAI(
base_url = DEPLOYMENT_URL, base_url=DEPLOYMENT_URL,
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model = "ibm-granite/granite-docling-258M", model="ibm-granite/granite-docling-258M",
messages = [ messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png" "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
} },
}, },
{ {
"type": "text", "type": "text",
"text": "Convert this page to docling." "text": "Convert this page to docling.",
} },
] ]
} }
], ],
stream = True stream=True,
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end = "") print(message.choices[0].delta.content, end="")
``` ```
!!! note !!! note

View File

@ -36,15 +36,16 @@ pip install vllm litellm
```python ```python
import litellm import litellm
messages = [{ "content": "Hello, how are you?","role": "user"}] messages = [{"content": "Hello, how are you?", "role": "user"}]
# the "hosted_vllm/" prefix is required # the "hosted_vllm/" prefix is required
response = litellm.completion( response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages, messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2, temperature=0.2,
max_tokens=80) max_tokens=80,
)
print(response) print(response)
``` ```

View File

@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
- name: vllm-leader - name: vllm-leader
image: docker.io/vllm/vllm-openai:latest image: docker.io/vllm/vllm-openai:latest
env: env:
- name: HUGGING_FACE_HUB_TOKEN - name: HF_TOKEN
value: <your-hf-token> value: <your-hf-token>
command: command:
- sh - sh
@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
cpu: 125 cpu: 125
env: env:
- name: HUGGING_FACE_HUB_TOKEN - name: HF_TOKEN
value: <your-hf-token> value: <your-hf-token>
volumeMounts: volumeMounts:
- mountPath: /dev/shm - mountPath: /dev/shm

View File

@ -36,11 +36,11 @@ pip install -U vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py> 1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
1. Run the script 1. Run the script
```python ```bash
python retrieval_augmented_generation_with_langchain.py python retrieval_augmented_generation_with_langchain.py
``` ```
@ -74,10 +74,10 @@ pip install vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py> 1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
1. Run the script: 1. Run the script:
```python ```bash
python retrieval_augmented_generation_with_llamaindex.py python retrieval_augmented_generation_with_llamaindex.py
``` ```

View File

@ -20,7 +20,7 @@ pip install vllm streamlit openai
vllm serve Qwen/Qwen1.5-0.5B-Chat vllm serve Qwen/Qwen1.5-0.5B-Chat
``` ```
1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py> 1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
1. Start the streamlit web UI and start to chat: 1. Start the streamlit web UI and start to chat:

View File

@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
"vllm serve meta-llama/Llama-3.2-1B-Instruct" "vllm serve meta-llama/Llama-3.2-1B-Instruct"
] ]
env: env:
- name: HUGGING_FACE_HUB_TOKEN - name: HF_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: hf-token-secret name: hf-token-secret
@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
] ]
env: env:
- name: HUGGING_FACE_HUB_TOKEN - name: HF_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: hf-token-secret name: hf-token-secret
@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
] ]
env: env:
- name: HUGGING_FACE_HUB_TOKEN - name: HF_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: hf-token-secret name: hf-token-secret

View File

@ -49,7 +49,7 @@ Here is a sample of `LLM` class usage:
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>. The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
### OpenAI-Compatible API Server ### OpenAI-Compatible API Server
@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.
vllm serve <model> vllm serve <model>
``` ```
The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>. The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
Sometimes you may see the API server entrypoint used directly instead of via the Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example: `vllm` CLI command. For example:
@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
`python -m vllm.entrypoints.openai.api_server` is deprecated `python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release. and may become unsupported in a future release.
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>. That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document. More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
@ -101,7 +101,7 @@ processing.
- **Output Processing**: Processes the outputs generated by the model, decoding the - **Output Processing**: Processes the outputs generated by the model, decoding the
token IDs from a language model into human-readable text. token IDs from a language model into human-readable text.
The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>. The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).
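To make the processing loop above concrete, here is a small, hedged sketch of driving `LLMEngine` directly; constructor and method details are simplified and may differ between vLLM versions:

```python
# Sketch of the add_request()/step() loop described above.
# Model name and arguments are illustrative; see EngineArgs for the full set.
from vllm import EngineArgs, LLMEngine, SamplingParams

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("req-0", "Explain KV caching in one sentence.",
                   SamplingParams(max_tokens=32))

# Each step() schedules work, executes the model, and processes outputs
# for one iteration, returning the requests updated in that iteration.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```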
### AsyncLLMEngine ### AsyncLLMEngine
@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients. can handle multiple concurrent requests and stream outputs to clients.
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>. API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).
The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>. The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).
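A correspondingly hedged sketch of streaming a single request from the async engine (argument names and order for `generate()` are assumptions and may differ between versions):

```python
# Sketch: stream partial outputs for one request from AsyncLLMEngine.
# Engine construction and generate() arguments are illustrative only.
import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m")
    )
    async for request_output in engine.generate(
        "Write a haiku about streaming.",
        SamplingParams(max_tokens=32),
        request_id="req-0",
    ):
        # Each yielded RequestOutput holds the text generated so far.
        print(request_output.outputs[0].text)


asyncio.run(main())
```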
## Worker ## Worker

View File

@ -17,7 +17,7 @@ In this document we will discuss the:
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches). In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
!!! note !!! note
The following contents are mostly based on the last commit of <gh-pr:20059>. The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.
## Motivation ## Motivation
@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
The goal of this structure is to uniquely identify a (padded) batch with the minimal set of items corresponding to a CUDA Graphs item. We can safely exclude items like `uniform_query_len` because it is currently a constant at runtime for a given setup. For example, it should be either `1` for an ordinary pure decode or `1+num_spec_tokens` for the validation phase of speculative decode. The goal of this structure is to uniquely identify a (padded) batch with the minimal set of items corresponding to a CUDA Graphs item. We can safely exclude items like `uniform_query_len` because it is currently a constant at runtime for a given setup. For example, it should be either `1` for an ordinary pure decode or `1+num_spec_tokens` for the validation phase of speculative decode.
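A minimal sketch of such a descriptor, assuming only the two fields discussed here (the real `BatchDescriptor` may define more or differ in details):

```python
# Conceptual sketch of the key used to look up a captured CUDA Graph.
# Only the fields discussed above are included; the real class may differ.
from typing import NamedTuple


class BatchDescriptor(NamedTuple):
    num_tokens: int               # padded token count of the batch
    uniform_decode: bool = False  # True for pure/speculative decode batches
```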
!!! note !!! note
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<gh-pr:23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs). The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
### `CudagraphDispatcher` ### `CudagraphDispatcher`
@ -106,9 +106,11 @@ The dispatch code looks like:
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
# execution # execution
with set_forward_context(..., with set_forward_context(
cudagraph_runtime_mode=runtime_mode, ...,
batch_descriptor=batch_descriptor): cudagraph_runtime_mode=runtime_mode,
batch_descriptor=batch_descriptor,
):
output = self.model(...) output = self.model(...)
``` ```
@ -165,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
"""NO CUDA Graphs support""" """NO CUDA Graphs support"""
``` ```
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
The following table lists backends that support full CUDA Graphs at the time of writing. The following table lists backends that support full CUDA Graphs at the time of writing.
@ -200,12 +202,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
import vllm import vllm
from vllm.config import CUDAGraphMode from vllm.config import CUDAGraphMode
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM( model = vllm.LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
dtype='auto', dtype="auto",
compilation_config = compilation_config, compilation_config=compilation_config,
) )
sampling_params = vllm.SamplingParams( sampling_params = vllm.SamplingParams(
temperature=0, # greedy decoding temperature=0, # greedy decoding
max_tokens=1024, max_tokens=1024,

View File

@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch * `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch * `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled. Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo` EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency`
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES` Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`

View File

@ -2,7 +2,7 @@
## Introduction ## Introduction
FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py) FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py)
Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types. Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,
The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class. The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class.
Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py). Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
`FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method. `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens. The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
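As a purely conceptual sketch of this composition (method names and arguments here are simplified stand-ins, not the real vLLM signatures):

```python
# Conceptual composition only -- the actual interfaces carry many more
# arguments (quantization scales, expert maps, activation formats, ...).
class ModularKernelSketch:
    def __init__(self, prepare_finalize, experts):
        self.prepare_finalize = prepare_finalize  # FusedMoEPrepareAndFinalize-like
        self.experts = experts                    # FusedMoEPermuteExpertsUnpermute-like

    def forward(self, hidden_states, topk_ids, topk_weights):
        # 1. Quantize/dispatch activations to the ranks that own the experts.
        dispatched = self.prepare_finalize.prepare(hidden_states, topk_ids)
        # 2. Permute, run the expert MLPs, and unpermute.
        expert_out = self.experts.apply(dispatched, topk_ids, topk_weights)
        # 3. Combine across ranks; TopK weight application and reduction happen
        #    here or in step 2, as selected via a TopKWeightAndReduce object.
        return self.prepare_finalize.finalize(expert_out, topk_weights)
```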
@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
#### Step 1: Add an All2All manager #### Step 1: Add an All2All manager
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py). The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).
#### Step 2: Add a FusedMoEPrepareAndFinalize Type #### Step 2: Add a FusedMoEPrepareAndFinalize Type
@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl
### How To Unit Test
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
compatible, runs some correctness tests.
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` respectively in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py), as shown in the sketch below.
2. Update the `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`, and
`Config::is_fe_supports_chunking()` methods in [tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py).
Doing this will add the new implementation to the test suite.
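A hypothetical sketch of step 1 follows; the two list names come from `mk_objects.py`, while the classes below are placeholders standing in for your new implementations:

```python
# Placeholder classes standing in for the new implementations being registered.
class MyPrepareAndFinalize:  # would be a FusedMoEPrepareAndFinalize subclass
    ...


class MyFusedExperts:  # would be a FusedMoEPermuteExpertsUnpermute subclass
    ...


# In tests/kernels/moe/modular_kernel_tools/mk_objects.py, append the new types
# to the existing lists so the test suite picks them up.
MK_ALL_PREPARE_FINALIZE_TYPES = [MyPrepareAndFinalize]  # plus the existing entries
MK_FUSED_EXPERT_TYPES = [MyFusedExperts]  # plus the existing entries
```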
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
with incompatible types, the script will error.
### How To Profile
Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`


@ -6,11 +6,11 @@ When performing an inference with IO Processor plugins, the prompt type is defin
## Writing an IO Processor Plugin
IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:
```python
IOProcessorInput = TypeVar("IOProcessorInput")
IOProcessorOutput = TypeVar("IOProcessorOutput")

class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
    def pre_process(
        self,
        prompt: IOProcessorInput,
        request_id: str | None = None,
        **kwargs,
    ) -> PromptType | Sequence[PromptType]:
        raise NotImplementedError

    async def pre_process_async(
        self,
        prompt: IOProcessorInput,
        request_id: str | None = None,
        **kwargs,
    ) -> PromptType | Sequence[PromptType]:
        return self.pre_process(prompt, request_id, **kwargs)

    @abstractmethod
    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        raise NotImplementedError

    async def post_process_async(
        self,
        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        collected_output = [item async for i, item in model_output]
@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
    @abstractmethod
    def output_to_response(
        self, plugin_output: IOProcessorOutput
    ) -> IOProcessorResponse:
        raise NotImplementedError
```
@ -64,9 +67,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available in [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
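To make the flow above concrete, a bare-bones plugin might look like the sketch below. The input/output types, the processing logic, and the `parse_request` signature are hypothetical; only the `IOProcessor` interface and its method names come from the interface shown above:

```python
from dataclasses import dataclass

from vllm.plugins.io_processors.interface import IOProcessor  # path as documented above


@dataclass
class MyInput:  # hypothetical plugin input type
    text: str


@dataclass
class MyOutput:  # hypothetical plugin output type
    scores: list[float]


class MyIOProcessor(IOProcessor[MyInput, MyOutput]):
    def parse_request(self, request) -> MyInput:
        # Validate the raw request and convert it into the plugin input type.
        return MyInput(text=request["text"])

    def pre_process(self, prompt: MyInput, request_id=None, **kwargs):
        # Turn the plugin input into a regular vLLM prompt.
        return {"prompt": prompt.text}

    def post_process(self, model_output, request_id=None, **kwargs):
        # Turn the pooling outputs into the plugin output type; a real plugin
        # would decode the pooled tensors here.
        return MyOutput(scores=[0.0 for _ in model_output])

    def output_to_response(self, plugin_output: MyOutput):
        # Online serving only: wrap the plugin output into an IOProcessorResponse.
        # The exact construction depends on the IOProcessorResponse type, so it
        # is left unimplemented in this sketch.
        raise NotImplementedError
```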
## Using an IO Processor plugin


@ -80,13 +80,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_decode_time_seconds` - Requests decode time.
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.
See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.
### Prometheus Client Library
Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.
With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
@ -99,7 +99,7 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
### Multi-process Mode
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
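For reference, the standard `prometheus_client` multiprocess wiring looks roughly like this (a generic sketch of the library's documented usage, not vLLM's exact code):

```python
import os

# Every process must point at the same directory before metrics are created.
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", "/tmp/prometheus_multiproc")

from prometheus_client import CollectorRegistry, generate_latest, multiprocess

# In the process serving /metrics, aggregate the per-process metric files.
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
print(generate_latest(registry).decode())
```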
### Built in Python/Process Metrics
@ -125,32 +125,32 @@ vLLM instance.
For background, these are some of the relevant PRs which added the v0 metrics:
- <https://github.com/vllm-project/vllm/pull/1890>
- <https://github.com/vllm-project/vllm/pull/2316>
- <https://github.com/vllm-project/vllm/pull/2730>
- <https://github.com/vllm-project/vllm/pull/4464>
- <https://github.com/vllm-project/vllm/pull/7279>
Also note the ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where e.g. [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
## v1 Design
### v1 PRs
For background, here are the relevant v1 PRs relating to the v1
metrics issue <https://github.com/vllm-project/vllm/issues/10582>:
- <https://github.com/vllm-project/vllm/pull/11962>
- <https://github.com/vllm-project/vllm/pull/11973>
- <https://github.com/vllm-project/vllm/pull/10907>
- <https://github.com/vllm-project/vllm/pull/12416>
- <https://github.com/vllm-project/vllm/pull/12478>
- <https://github.com/vllm-project/vllm/pull/12516>
- <https://github.com/vllm-project/vllm/pull/12530>
- <https://github.com/vllm-project/vllm/pull/12561>
- <https://github.com/vllm-project/vllm/pull/12579>
- <https://github.com/vllm-project/vllm/pull/12592>
- <https://github.com/vllm-project/vllm/pull/12644>
### Metrics Collection
@ -369,7 +369,7 @@ vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="F
However, `prometheus_client` has
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
for [unclear reasons](https://github.com/vllm-project/vllm/pull/7279#discussion_r1710417152). We
simply use a `Gauge` metric set to 1 and
`multiprocess_mode="mostrecent"` instead.
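Roughly, that workaround looks like the following with `prometheus_client` (the label set and documentation string are abbreviated for this sketch):

```python
from prometheus_client import Gauge

# Expose config info as a Gauge fixed at 1, since Info metrics are not
# supported in multiprocess mode; the labels carry the actual information.
cache_config_info = Gauge(
    "vllm:cache_config_info",
    "Information about the KV cache configuration",
    labelnames=["block_size", "cache_dtype"],
    multiprocess_mode="mostrecent",
)
cache_config_info.labels(block_size="16", cache_dtype="auto").set(1)
```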
@ -394,7 +394,7 @@ distinguish between per-adapter counts. This should be revisited.
Note that `multiprocess_mode="livemostrecent"` is used - the most
recent metric is used, but only from currently running processes.
This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
If we revisit this design and deprecate the old metric, we should reduce
the need for a significant deprecation period by making the change in
@ -402,7 +402,7 @@ v0 also and asking this project to move to the new metric.
### Prefix Cache metrics
The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
some interesting points which may be relevant to how we approach
future metrics.
@ -439,8 +439,8 @@ suddenly (from their perspective) when it is removed, even if there is
an equivalent metric for them to use.
As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).
In general:
@ -460,33 +460,35 @@ the project-wide deprecation policy.
### Unimplemented - `vllm:tokens_total`
Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
removed.
### Duplicated - Queue Time
The `vllm:time_in_queue_requests` Histogram metric was added by
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:
```python
self.metrics.first_scheduled_time = now
self.metrics.time_in_queue = now - self.metrics.arrival_time
```
Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
us with:
```python
if seq_group.is_finished():
    if (
        seq_group.metrics.first_scheduled_time is not None
        and seq_group.metrics.first_token_time is not None
    ):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time
        )
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(seq_group.metrics.time_in_queue)
```
This seems duplicative, and one of them should be removed. The latter
@ -511,7 +513,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In v0, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
sharing between requests, and copy-on-write to do branching. CPU
@ -524,7 +526,7 @@ and the part of the prompt that was evicted can be recomputed.
SequenceGroup was removed in V1, although a replacement will be
required for "parallel sampling" (`n>1`).
[Beam search was moved out of the core (in V0)](https://github.com/vllm-project/vllm/issues/8306). There was a
lot of complex code for a very uncommon feature.
In V1, with prefix caching being better (zero overhead) and therefore
@ -539,7 +541,7 @@ Some v0 metrics are only relevant in the context of "parallel
sampling". This is where the `n` parameter in a request is used to sampling". This is where the `n` parameter in a request is used to
request multiple completions from the same prompt. request multiple completions from the same prompt.
As part of adding parallel sampling support in <gh-pr:10980>, we should As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
also add these metrics. also add these metrics.
- `vllm:request_params_n` (Histogram) - `vllm:request_params_n` (Histogram)
@ -564,7 +566,7 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should
revisit the v0 metrics in this context.
@ -585,7 +587,7 @@ see:
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.
This is a non-trivial topic. Consider this comment from Rob:
@ -652,7 +654,7 @@ fall under the more general heading of "Observability".
v0 has support for OpenTelemetry tracing:
- Added by <https://github.com/vllm-project/vllm/pull/4687>
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing docs](../examples/online_serving/opentelemetry.md)
@ -683,7 +685,7 @@ documentation for this option states:
> use of possibly costly and or blocking operations and hence might
> have a performance impact.
The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and show up in an OpenTelemetry trace
as:
```text


@ -60,7 +60,7 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
## Processor Output Caching
Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of the HF processor to avoid processing the same multi-modal input (e.g. image) again.
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
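A minimal sketch of that lookup/merge flow (purely illustrative, not vLLM's actual cache implementation) might look like:

```python
# Illustrative only: check the cache first, run the HF processor on the misses
# in one batch, then return outputs in the original input order.
def process_with_cache(items, cache, key_fn, hf_process_batch):
    keys = [key_fn(item) for item in items]
    missing = [(k, item) for k, item in zip(keys, items) if k not in cache]
    if missing:
        outputs = hf_process_batch([item for _, item in missing])
        for (k, _), out in zip(missing, outputs):
            cache[k] = out
    return [cache[k] for k in keys]
```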


@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`,</br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |


@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
Related PRs:
- <https://github.com/vllm-project/vllm/pull/8823>
## Prior State in v1


@ -112,8 +112,8 @@ class KVCacheBlock:
    ref_cnt: int

    # The pointers to form a doubly linked list for the free queue.
    prev_free_block: "KVCacheBlock | None" = None
    next_free_block: "KVCacheBlock | None" = None
```
There are two design points to highlight:


@ -19,8 +19,8 @@ vLLM will take all the available factors into consideration, and decide a direct
The factors considered include:
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](../../vllm/config))
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](../../vllm/compilation/compiler_interface.py))
- The model's forward function and the relevant functions called by the forward function (see below)
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
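For example, a single debugging run can disable the cache like this (the model name is just a placeholder; only the environment variable comes from the text above):

```python
import os

# Disable the compilation cache for this run, e.g. while debugging compilation.
# Set it before vLLM is imported so the setting is picked up.
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"

from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model name
```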


@ -44,15 +44,15 @@ th:not(:first-child) {
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |

\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
@ -61,20 +61,20 @@ th:not(:first-child) {
### Feature x Hardware

| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|------------|
| [CP][chunked-prefill] | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [](https://github.com/vllm-project/vllm/issues/25097) | ✅ |


@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
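For a minimal offline setup, enabling APC looks like this (the model name is just a placeholder):

```python
from vllm import LLM

# Enable Automatic Prefix Caching so repeated prompt prefixes reuse KV cache.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model name
    enable_prefix_caching=True,
)
```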
## Example workloads


@ -17,14 +17,14 @@ Two main reasons:
## Usage example
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
Five types of connectors are currently supported:
- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling, which uses NIXL as the underlying KV transmission.
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
```bash
@ -45,7 +45,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
## Benchmarks
Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.
## Development


@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    stop=["[/assistant]"],
)

prompts = [
@ -43,11 +43,11 @@ the third parameter is the path to the LoRA adapter.
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
)
```
Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
## Serving LoRA Adapters
@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
lora_request = LoRARequest(
    lora_name=lora_name,
    lora_path=local_path,
    lora_int_id=abs(hash(lora_name)),
)
return lora_request
```
@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
if has_audio:
    question = f"<|audio|>{question}"
chat = [
    {"role": "user", "content": question},
]
return tokenizer.apply_chat_template(chat, tokenize=False)


@ -3,7 +3,7 @@
This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
!!! note
    We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
    and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
!!! tip
@ -129,7 +129,7 @@ You can pass a single image to the `'image'` field of the multi-modal dictionary
    print(generated_text)
```
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": [image1, image2]},
})

for o in outputs:
@ -164,7 +162,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
    print(generated_text)
```
Full example: [examples/offline_inference/vision_language_multi_image.py](../../examples/offline_inference/vision_language_multi_image.py)
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
@ -183,21 +181,24 @@ conversation = [
{"role": "assistant", "content": "Hello! How can I assist you today?"}, {"role": "assistant", "content": "Hello! How can I assist you today?"},
{ {
"role": "user", "role": "user",
"content": [{ "content": [
"type": "image_url", {
"image_url": { "type": "image_url",
"url": image_url "image_url": {"url": image_url},
} },
},{ {
"type": "image_pil", "type": "image_pil",
"image_pil": image_pil "image_pil": image_pil,
}, { },
"type": "image_embeds", {
"image_embeds": image_embeds "type": "image_embeds",
}, { "image_embeds": image_embeds,
"type": "text", },
"text": "What's in these images?" {
}], "type": "text",
"text": "What's in these images?",
},
],
}, },
] ]
@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
message = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Describe this set of frames. Consider the frames to be a part of the same video.",
        },
    ],
}
for i in range(len(video_frames)):
@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
# Custom black background for dark theme
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
)

# Custom brand color background (e.g., blue)
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
)
```
@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
    limit_mm_per_prompt={"video": 1},
)
sampling_params = SamplingParams(max_tokens=1024)

video_messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant.",
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "describe this video."},
            {
                "type": "video",
                "video": video_path,
                "total_pixels": 20480 * 28 * 28,
                "min_pixels": 16 * 28 * 28,
            },
        ]
    },
]
@ -339,13 +346,13 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
!!! note
    'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
### Audio Inputs
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
### Embedding Inputs
@ -427,11 +434,11 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
A chat template is **required** to use Chat Completions API.
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
If no default chat template is available, we will first look for a built-in fallback in [vllm/transformers_utils/chat_templates/registry.py](../../vllm/transformers_utils/chat_templates/registry.py).
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside [examples](../../examples).
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
### Image Inputs
@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                    "uuid": image_url,  # Optional
                },
            ],
        }
    ],
)

print("Chat completion output:", chat_response.choices[0].message.content)
@ -489,31 +499,32 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_duck},
                    "uuid": image_url_duck,  # Optional
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url_lion},
                    "uuid": image_url_lion,  # Optional
                },
            ],
        }
    ],
)

print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
!!! tip
    Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows:
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?",
                },
                {
                    "type": "video_url",
                    "video_url": {"url": video_url},
                    "uuid": video_url,  # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@ -585,7 +595,7 @@ Then, you can use the OpenAI client as follows:
print("Chat completion output from image url:", result) print("Chat completion output from image url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
!!! note
    By default, the timeout for fetching videos through HTTP URL is `30` seconds.
@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows:
audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav",
                    },
                    "uuid": audio_url,  # Optional
                },
            ],
        },
    ],
    model=model,
    max_completion_tokens=64,
)
@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
```python
chat_completion_from_url = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_url},
                    "uuid": audio_url,  # Optional
                },
            ],
        }
    ],
    model=model,
    max_completion_tokens=64,
)
@ -707,7 +719,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
print("Chat completion output from audio url:", result) print("Chat completion output from audio url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
!!! note
    By default, the timeout for fetching audios through HTTP URL is `10` seconds.
@@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
    "type": "image_embeds",
    "image_embeds": f"{base64_image_embedding}",
    "uuid": image_url,  # Optional
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_grid_thw": f"{base64_image_grid_thw}",  # Required by Qwen/Qwen2-VL-2B-Instruct
    },
    "uuid": image_url,  # Optional
}

model = "openbmb/MiniCPM-V-2_6"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_sizes": f"{base64_image_sizes}",  # Required by openbmb/MiniCPM-V-2_6
    },
    "uuid": image_url,  # Optional
}

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ],
        },
    ],
    model=model,
)
```
@@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
{
    "type": "image_embeds",
    "image_embeds": None,
    "uuid": image_uuid,
},
# input_audio:
{
    "type": "input_audio",
    "input_audio": None,
    "uuid": audio_uuid,
},
# PIL Image:
{
    "type": "image_pil",
    "image_pil": None,
    "uuid": image_uuid,
},
```
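Putting this together, a follow-up request that relies on previously cached media might look like the sketch below. It assumes `client`, `model`, and `image_uuid` are already defined as in the earlier examples, and that the image was sent with the same UUID in a prior request so the cache can be hit.

```python
# A minimal sketch: resend only the UUID of an already-cached image, with no media payload.
chat_completion_cached = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the cached image again, briefly."},
                {
                    "type": "image_embeds",
                    "image_embeds": None,  # media omitted; resolved from the cache by UUID
                    "uuid": image_uuid,
                },
            ],
        }
    ],
    model=model,
)
print(chat_completion_cached.choices[0].message.content)
```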
@@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
As a quick start, install the NIXL library with `uv pip install nixl`.

- Refer to the [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files

For non-CUDA platforms, please install NIXL with UCX built from source, as instructed below.

@@ -156,10 +156,20 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.

## Experimental Feature

### Heterogeneous KV Layout support

Supported use case: prefill with the `HND` KV layout and decode with `NHD`, enabled via the experimental configuration:

```bash
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
```

## Example Scripts/Code

Refer to these example scripts in the vLLM repository:

- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
@@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:

[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
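As a rough sketch of that flow (the model name and the `enable_prompt_embeds` flag are assumptions for illustration; check your vLLM version and the full example above), the offline path looks roughly like this:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # assumed model for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_model = AutoModelForCausalLM.from_pretrained(model_id)

# Build input embeddings for a text prompt with the HF embedding layer.
token_ids = tokenizer("What is the capital of France?", return_tensors="pt").input_ids
with torch.no_grad():
    prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

# Pass the embeddings through the 'prompt_embeds' field of the prompt dictionary.
llm = LLM(model=model_id, enable_prompt_embeds=True)
outputs = llm.generate([{"prompt_embeds": prompt_embeds}])
print(outputs[0].outputs[0].text)
```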
## Online Serving

@@ -37,4 +37,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
Then, you can use the OpenAI client as follows:

[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)
@@ -64,4 +64,4 @@ th:not(:first-child) {
!!! note
    This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
    For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.
@@ -1,5 +1,9 @@
# AutoAWQ

> ⚠️ **Warning:**
> The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq).
> For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ).

To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantization reduces the model's precision from BF16/FP16 to INT4, which effectively reduces the total model memory footprint.
The main benefits are lower latency and memory usage.
@@ -18,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "mistral-instruct-v0.2-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
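The hunk ends after the load step; the quantization and export calls that typically follow (per the AutoAWQ README, not shown in this diff) are a short continuation:

```python
# Sketch of the usual AutoAWQ continuation: quantize, then save model and tokenizer.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```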
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

bits, group_size, sym = 4, 128, True
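The remaining AutoRound steps are not part of this hunk; a typical continuation, based on AutoRound's documented API (verify against your installed version; the output path is an assumption), looks like this:

```python
# Sketch: run AutoRound and export the quantized checkpoint.
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
autoround.quantize()

output_dir = "./Qwen3-0.6B-int4"  # assumed output path
autoround.save_quantized(output_dir, format="auto_round", inplace=True)
```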
@@ -34,7 +34,7 @@ llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitblas",
)
```

@@ -53,6 +53,6 @@ llm = LLM(
    dtype=torch.float16,
    trust_remote_code=True,
    quantization="bitblas",
    max_model_len=1024,
)
```
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)
```

@@ -43,7 +43,7 @@ llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitsandbytes",
)
```
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```

@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
# Configure the simple PTQ quantization
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
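After `oneshot` completes, the compressed checkpoint still has to be written out before vLLM can load it; a typical save step (the directory naming is an assumption here) is:

```python
# Sketch: persist the FP8-quantized model in compressed form for vLLM.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"  # assumed naming convention
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```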
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant",
    },
    {
        "role": "user",
        "content": "Hello",
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?",
    },
    {
        "role": "user",

@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset(
    "allenai/c4",
    data_files="en/c4-train.00001-of-01024.json.gz",
    split="train",
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)
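The quantization itself is not shown in this hunk; based on the GPTQModel README, the remaining steps are roughly as follows (the output path is an assumption):

```python
# Sketch: load, quantize with the calibration set, and save the result.
from gptqmodel import GPTQModel

model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)
model.quantize(calibration_dataset, batch_size=1)
model.save("Llama-3.2-1B-Instruct-gptqmodel-4bit")  # assumed output path
```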
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```

@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
    },
    ignore=["lm_head"],
    update_size=NUM_CALIBRATION_SAMPLES,
    dampening_frac=0.01,
)
```
@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams

def main():
    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    kv_cache_dtype="fp8",
    calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)

@@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype="auto",
)
model.eval()

@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA]

tokenized_outputs = tokenizer(
    text_data,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
    tokenized_outputs['input_ids'],
    batch_size=BATCH_SIZE,
    drop_last=True,
)
```

### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
                                      load_quant_algo_config_from_file)

# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
    observer_method="min_max",
    is_dynamic=False,
).to_quantization_spec()

# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(
    input_tensors=FP8_PER_TENSOR_SPEC,
    weight=FP8_PER_TENSOR_SPEC,
)

# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {
    name: QuantizationConfig(
        input_tensors=global_quant_config.input_tensors,
        weight=global_quant_config.weight,
        output_tensors=KV_CACHE_SPEC,
    )
    for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy()

# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

EXCLUDE_LAYERS = ["lm_head"]

@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    layer_quant_config=layer_quant_config,
    kv_cache_quant_config=kv_cache_quant_config,
    exclude=EXCLUDE_LAYERS,
    algo_config=algo_config,
)
```

### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
    exporter.export_safetensors_model(
        freezed_model,
        quant_config=quant_config,
        tokenizer=tokenizer,
    )
```

### 5. Evaluation in vLLM

@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
    kv_cache_dtype="fp8",
    quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)
@@ -11,6 +11,9 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |

@@ -18,8 +21,9 @@ vLLM currently supports the following reasoning models:
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |

!!! note
    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
    The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
    DeepSeek-V3.1 tool calling is supported in non-thinking mode.

## Quickstart
@@ -115,9 +119,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(
    model=model,
    messages=messages,
    stream=True,
)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
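The consuming loop is outside this hunk; a sketch of how the stream is typically read (note that `reasoning_content` is a vLLM extension to the OpenAI delta, so it is accessed defensively here) is:

```python
# Sketch: print reasoning deltas and regular content deltas as they arrive.
for chunk in stream:
    delta = chunk.choices[0].delta
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        if not printed_reasoning_content:
            printed_reasoning_content = True
            print("reasoning_content:", end="", flush=True)
        print(reasoning, end="", flush=True)
    elif delta.content:
        print(delta.content, end="", flush=True)
```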
@@ -157,27 +163,29 @@ The reasoning content is also available when both tool calling and the reasoning
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            },
        },
    }
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
)

print(response)

@@ -188,7 +196,7 @@ The reasoning content is also available when both tool calling and the reasoning
print(f"Arguments: {tool_call.arguments}")
```

For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).

## Limitations

@@ -196,7 +204,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
## How to support a new reasoning model

You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).

??? code
@@ -223,7 +231,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and

@@ -233,8 +241,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        """

    def extract_reasoning_content(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from a complete model-generated string.

@@ -254,7 +264,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
        """
```

Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).

??? code

@@ -272,10 +282,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
    @classmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
        return cls(
            start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
            end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.end_token_id in input_ids
@@ -3,7 +3,7 @@
!!! warning
    Please note that speculative decoding in vLLM is not yet optimized and does
    not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
    The work to optimize it is ongoing and can be followed here: <https://github.com/vllm-project/vllm/issues/4630>

!!! warning
    Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.

@@ -183,7 +183,7 @@ A variety of speculative models of this type are available on HF hub:
## Speculating using EAGLE based draft models

The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](../../examples/offline_inference/spec_decode.py).

??? code
@@ -218,8 +218,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
A few important things to consider when using the EAGLE based draft models:

1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
   be able to be loaded and used directly by vLLM after <https://github.com/vllm-project/vllm/pull/12304>.
   If you are using a vLLM version before <https://github.com/vllm-project/vllm/pull/12304>, please use the
   [script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
   and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
@@ -229,7 +229,7 @@ A few important things to consider when using the EAGLE based draft models:
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
   reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
   investigation and tracked here: <https://github.com/vllm-project/vllm/issues/9565>.

4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
   That is, to specify `"method": "eagle3"` in `speculative_config`.
@@ -267,7 +267,7 @@ speculative decoding, breaking down the guarantees into three key areas:
>   distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
>   without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
>   provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../tests/spec_decode/e2e)
>   verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)

3. **vLLM Logprob Stability**
@@ -289,4 +289,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr
- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565)
@@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
Answer: x = -29/8
```

An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)

## Offline Inference
@@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools:
    return f"Getting the weather for {location} in {unit}..."

tool_functions = {"get_weather": get_weather}

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "unit"],
            },
        },
    },
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
)

tool_call = response.choices[0].message.tool_calls[0].function
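The hunk stops at extracting the tool call; a short continuation that actually dispatches to the registered function (a sketch mirroring the example's `tool_functions` mapping) could be:

```python
# Sketch: parse the arguments and invoke the matching local function.
import json

args = json.loads(tool_call.arguments)
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {tool_functions[tool_call.name](**args)}")
```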
@@ -149,9 +151,9 @@ Known issues:
   much shorter than what vLLM generates. Since an exception is thrown when this condition
   is not met, the following additional chat templates are provided:

* [examples/tool_chat_template_mistral.jinja](../../examples/tool_chat_template_mistral.jinja) - this is the "official" Mistral chat template, but tweaked so that
  it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
* [examples/tool_chat_template_mistral_parallel.jinja](../../examples/tool_chat_template_mistral_parallel.jinja) - this is a "better" version that adds a tool-use system prompt
  when tools are provided, which results in much better reliability when working with parallel tool calling.

Recommended flags:
@@ -185,16 +187,16 @@ Known issues:
VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2:

* [examples/tool_chat_template_llama3.1_json.jinja](../../examples/tool_chat_template_llama3.1_json.jinja) - this is the "official" chat template for the Llama 3.1
  models, but tweaked so that it works better with vLLM.
* [examples/tool_chat_template_llama3.2_json.jinja](../../examples/tool_chat_template_llama3.2_json.jinja) - this extends upon the Llama 3.1 chat template by adding support for
  images.

Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`

VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended:

* [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.

For Llama 4 models, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.

@@ -210,7 +212,7 @@ Supported models:
Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`

[examples/tool_chat_template_granite.jinja](../../examples/tool_chat_template_granite.jinja): this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.

* `ibm-granite/granite-3.1-8b-instruct`

@@ -222,7 +224,7 @@ Supported models:
Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`

[examples/tool_chat_template_granite_20b_fc.jinja](../../examples/tool_chat_template_granite_20b_fc.jinja): this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.

### InternLM Models (`internlm`)
@@ -280,8 +282,8 @@ Flags: `--tool-call-parser hermes`
Supported models:

* `MiniMaxAi/MiniMax-M1-40k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
* `MiniMaxAi/MiniMax-M1-80k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))

Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`

@@ -289,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m
Supported models:

* `deepseek-ai/DeepSeek-V3-0324` (use with [examples/tool_chat_template_deepseekv3.jinja](../../examples/tool_chat_template_deepseekv3.jinja))
* `deepseek-ai/DeepSeek-R1-0528` (use with [examples/tool_chat_template_deepseekr1.jinja](../../examples/tool_chat_template_deepseekr1.jinja))

Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`

@@ -298,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
Supported models:

* `deepseek-ai/DeepSeek-V3.1` (use with [examples/tool_chat_template_deepseekv31.jinja](../../examples/tool_chat_template_deepseekv31.jinja))

Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`

@@ -350,6 +352,16 @@ Supported models:
Flags: `--tool-call-parser qwen3_xml`
### Olmo 3 Models (`olmo3`)
Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but parallel tool calls are newline-delimited and the calls are wrapped in XML tags as `<function_calls>..</function_calls>`, as illustrated below. The parser also accepts the JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
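For illustration only (the function names below are made up), a response with two parallel calls in this format would look something like:

```text
<function_calls>
get_weather(location="San Francisco, CA", unit="celsius")
get_event_info(name="Fleet Week", verified=true, venue=null)
</function_calls>
```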
Supported models:
* TODO (will be updated after Olmo 3 release)
Flags: `--tool-call-parser olmo3`
### Models with Pythonic Tool Calls (`pythonic`)

A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.

@@ -367,12 +379,12 @@ Limitations:
Example supported models:

* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
* `Team-ACE/ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))

Flags: `--tool-call-parser pythonic --chat-template {see_above}`
@@ -381,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`
## How to Write a Tool Parser Plugin

A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).

Here is a summary of a plugin file:

@@ -402,8 +414,7 @@ Here is a summary of a plugin file:
    # adjust request. e.g.: set skip special tokens
    # to False for tool call output.
    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        return request

    # implement the tool call parse for stream call

@@ -416,7 +427,7 @@ Here is a summary of a plugin file:
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        return delta

    # implement the tool parse for non-stream call