Compare commits
184 Commits
woosuk/tes...amd_dev
| SHA1 | Author | Date | |
|---|---|---|---|
| c7021f1270 | |||
| 2072fdc044 | |||
| 6eefda507a | |||
| a0003b56b0 | |||
| 5beacce2ea | |||
| 8669c69afa | |||
| 1651003c35 | |||
| 1cb8c6c5fe | |||
| e05a6754a8 | |||
| 084a9dae80 | |||
| c9461e05a4 | |||
| 4dfdb821c8 | |||
| 58fab50d82 | |||
| db6f28d898 | |||
| 14e2f1231e | |||
| 7c4767f1eb | |||
| 9771e0b432 | |||
| 980de31ca0 | |||
| 1c160841ea | |||
| 4ca13a8667 | |||
| 675aa2ec64 | |||
| 3ae082c373 | |||
| 49c00fe304 | |||
| 141d3b9fc5 | |||
| abf3db40ef | |||
| 8e4ca4d14e | |||
| 1a0f4defb7 | |||
| 843af7f7fc | |||
| 1f633b8632 | |||
| a4c29e6e82 | |||
| 8f18feb191 | |||
| ed540d6d4c | |||
| f6027b2855 | |||
| ab3e80042e | |||
| ceacedc1f9 | |||
| bfa59be8f1 | |||
| 265ecb05fb | |||
| 09a7e6f617 | |||
| 6c2eef5a5d | |||
| 19748806f0 | |||
| 4a8a567e16 | |||
| 344a0017c0 | |||
| becb7de40b | |||
| 250fb1b8ea | |||
| 647214f3d5 | |||
| ddeec11ba9 | |||
| 86ed77022d | |||
| aa1356ec53 | |||
| ecc3c0940a | |||
| ba09652de2 | |||
| bd66b8529b | |||
| 6c728f7771 | |||
| 80e9452984 | |||
| c3a2c6ac5f | |||
| 72f431e709 | |||
| be4445072c | |||
| f381cf2302 | |||
| 5ff5d94e77 | |||
| f95da13c3d | |||
| aef368aa08 | |||
| 5f6cbf60d6 | |||
| 3ada34f9cb | |||
| 0eb8f2b880 | |||
| 163965d183 | |||
| a03cf9bc70 | |||
| 352c0c8a28 | |||
| bfe0b4bd2a | |||
| 58fbbcb2f5 | |||
| 87778d5f00 | |||
| f9e7ad5400 | |||
| 4d0f266113 | |||
| e93ff6c8b9 | |||
| 1c691f4a71 | |||
| 9fce7bee74 | |||
| b63f2143f8 | |||
| f32bf7582e | |||
| 8a81d776ce | |||
| f6fdacd82c | |||
| d31f7844f8 | |||
| 7a6c8c3fa1 | |||
| 221bf72577 | |||
| b3aba04e5a | |||
| 8a297115e2 | |||
| 191eed0bb9 | |||
| fb860670da | |||
| 83e760c57d | |||
| c2bba69065 | |||
| e133d6d218 | |||
| a1946c9f61 | |||
| 9f020f4f31 | |||
| 3b45075206 | |||
| 168e578efc | |||
| 6ac5e06f7c | |||
| 5c2acb270a | |||
| b26b70bec4 | |||
| ab4be40fc5 | |||
| 245e4f2c01 | |||
| 1d165d6d85 | |||
| 83004020fd | |||
| 12e21701e7 | |||
| 30a33b92ee | |||
| 7c572544e4 | |||
| c312320764 | |||
| c981f0ea78 | |||
| 6367bde739 | |||
| f50cc221ea | |||
| acedc74b1a | |||
| d29483b58a | |||
| 950cf9e58e | |||
| 3125d79950 | |||
| e33ee23ee3 | |||
| b10c64c834 | |||
| 0925b28a8e | |||
| 99722d5f0e | |||
| 4c91a28e30 | |||
| b038d9c40c | |||
| 2ba60ec7fe | |||
| bd7157a071 | |||
| be429d0cfd | |||
| c253745eb8 | |||
| daec4d2624 | |||
| 6c9fdbf725 | |||
| 483ea64611 | |||
| e20eba753b | |||
| bbc1b29665 | |||
| acb1bfa601 | |||
| 75c7ad9918 | |||
| 5550ff9c25 | |||
| 3aeb19a39e | |||
| 8c017b3490 | |||
| 9c2c2287a0 | |||
| fec2b341ad | |||
| 87bc0c492f | |||
| fe3b9372ad | |||
| bde9e2272a | |||
| 08405609cc | |||
| ab81379ea6 | |||
| 4ffd6e8942 | |||
| 965c5f4914 | |||
| 4d055ef465 | |||
| 17c540a993 | |||
| 4d4d6bad19 | |||
| 11ae016bd7 | |||
| 41d3071918 | |||
| fb5e10d3fb | |||
| b2f78cbad4 | |||
| 23583ee28c | |||
| 01c977e96d | |||
| b3dda72c23 | |||
| fb0571b077 | |||
| 2ed8b6b3d0 | |||
| 013abde6ef | |||
| a5464dcf92 | |||
| ac3ed5a815 | |||
| e6ba2000ae | |||
| aa255ff55a | |||
| 7bb736d00e | |||
| 9f4e30904b | |||
| 5afd3276df | |||
| 43721bc67f | |||
| 02d709a6f1 | |||
| 4a510ab487 | |||
| 314fa8abbf | |||
| 334535b6fb | |||
| dcbb3f1871 | |||
| 00417f4e44 | |||
| ed344f4116 | |||
| e51928793e | |||
| d2740fafbf | |||
| 17838e50ef | |||
| 44c8555621 | |||
| f7d318de2b | |||
| 76f0d05bc6 | |||
| 7d8975de84 | |||
| 785d8b6410 | |||
| f6cdc9a02f | |||
| 509cdc0370 | |||
| 9b6504c307 | |||
| e19b16dde6 | |||
| 582f2c6be7 | |||
| f8a0acbdbe | |||
| 1317034379 | |||
| 0ecc553ee6 | |||
| f96bc3649c |
@@ -1,11 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
   metrics:
   - name: "relaxed_accuracy,none"
-    value: 0.90
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
 limit: 100
 num_fewshot: 0
@@ -1,7 +1,6 @@
 # For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
 tasks:
 - name: "mmlu_pro"
   metrics:
@@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9

@@ -15,6 +15,21 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
     depends_on: ~
     id: build-wheel-cuda-12-8

@@ -28,20 +43,6 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.6"
-    depends_on: ~
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-cuda-12-9

@@ -55,6 +56,20 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86

@@ -62,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64

@@ -142,6 +156,22 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
@@ -70,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
   # Note: disable it until supports V1
   # Run AWQ test
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
@@ -454,8 +454,8 @@ steps:
   - pytest -v -s compile/test_fusion_attn.py
   - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s compile/test_sequence_parallelism.py
-  - pytest -v -s compile/test_async_tp.py
+  # - pytest -v -s compile/test_sequence_parallelism.py
+  # - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py

@@ -474,8 +474,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking

@@ -485,6 +485,7 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75

@@ -494,6 +495,7 @@ steps:
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
   commands:
   - pytest -v -s kernels/core kernels/test_top_k_per_row.py
@@ -606,7 +608,7 @@ steps:
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
   - uv pip install --system torchao==0.13.0
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75

@@ -848,6 +850,18 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1

@@ -923,8 +937,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true

@@ -937,8 +951,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
@@ -955,13 +967,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
 
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60

@@ -1081,6 +1112,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1128,6 +1160,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model

@@ -1172,7 +1209,6 @@ steps:
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
 
-
 - label: Weight Loading Multiple GPU Test # 33min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]

@@ -1201,6 +1237,18 @@ steps:
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1232,12 +1280,16 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
+  - pytest -v -s tests/compile/test_async_tp.py
+  - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 
@@ -172,6 +172,8 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2

@@ -349,7 +351,8 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15

@@ -384,7 +387,12 @@ steps:
     --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
     --ignore=lora/test_chatglm3_tp.py \
     --ignore=lora/test_llama_tp.py \
-    --ignore=lora/test_llm_with_multi_loras.py
+    --ignore=lora/test_llm_with_multi_loras.py \
+    --ignore=lora/test_olmoe_tp.py \
+    --ignore=lora/test_deepseekv2_tp.py \
+    --ignore=lora/test_gptoss.py \
+    --ignore=lora/test_qwen3moe_tp.py
 
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -416,8 +424,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:

@@ -425,6 +433,7 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75

@@ -528,8 +537,8 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75

@@ -807,8 +816,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true

@@ -821,8 +830,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
@@ -839,15 +846,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
-  - pytest -v -s tests/kernels/moe/test_flashinfer.py
-  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
   - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+  - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
+  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -954,6 +978,8 @@ steps:
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py

@@ -961,6 +987,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -1004,6 +1031,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model

@@ -1043,6 +1075,7 @@ steps:
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
+  - pytest -v -s -x lora/test_olmoe_tp.py
 
 
 - label: Weight Loading Multiple GPU Test # 33min
@@ -1068,6 +1101,17 @@ steps:
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####

@@ -1100,7 +1144,7 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"

@@ -1108,6 +1152,8 @@ steps:
   commands:
   - pytest -v -s tests/compile/test_async_tp.py
   - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 
.github/CODEOWNERS (vendored), 11 changes

@@ -5,8 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche

@@ -25,7 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill

@@ -44,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm

@@ -57,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC
 
 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 
 # Docs
.gitignore (vendored), 3 changes

@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**
 
+# uv
+uv.lock
+
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
@@ -4,7 +4,6 @@ MD013: false
 MD024:
   siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
@@ -38,7 +38,7 @@ repos:
     rev: 0.9.1
     hooks:
       - id: pip-compile
-        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
         files: ^requirements/test\.(in|txt)$
   - repo: local
     hooks:
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 
 #
 # Try to find python package with an executable that exactly matches

@@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -31,6 +31,7 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
 
 import datasets

@@ -501,15 +502,9 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    # semaphore = (asyncio.Semaphore(max_concurrency)
-    #              if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
 
     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
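The refactor above drops the `None` special case by always binding `semaphore` to an async context manager. Below is a minimal sketch of the same pattern, not vLLM's code; the `run_all`/`send_one` names are illustrative. It relies on `contextlib.nullcontext` supporting `async with` on Python 3.10+, which is exactly what the removed comment was waiting for.

```python
import asyncio
from contextlib import nullcontext


async def run_all(requests, send_one, max_concurrency=None):
    # Semaphore caps concurrency when requested; nullcontext() is a no-op
    # async context manager, so one "async with" path covers both cases.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(req):
        async with semaphore:
            return await send_one(req)

    return await asyncio.gather(*(limited(r) for r in requests))
```

With `max_concurrency=None` every request starts immediately; with a value set, at most that many awaits are in flight at once.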
@@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 def with_triton_mode(fn):
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
@@ -7,7 +7,8 @@ import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
@@ -9,9 +9,9 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )
 
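The import changes repeated across these benchmark files split the utilities between two modules: `FlexibleArgumentParser` stays in `vllm.utils`, while torch-specific helpers such as `STR_DTYPE_TO_TORCH_DTYPE` and `create_kv_caches_with_random` now come from `vllm.utils.torch_utils`. A hedged sketch of how a caller adapts to the new layout; the `--kv-cache-dtype` flag and the dictionary lookup are illustrative assumptions, not taken from this diff:

```python
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

parser = FlexibleArgumentParser(description="toy benchmark arguments")
parser.add_argument("--kv-cache-dtype", type=str, default="half")
args = parser.parse_args()

# STR_DTYPE_TO_TORCH_DTYPE maps dtype strings (e.g. "half", "bfloat16")
# to torch dtypes; unknown keys fall back to None in this sketch.
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE.get(args.kv_cache_dtype)
print(torch_dtype)
```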
@@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
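For reference, the `polynorm_naive` function in the removed benchmark above evaluates polynomial normalization as

$$
\mathrm{PolyNorm}(x) = w_0\,\frac{x^3}{\sqrt{\overline{x^6}+\varepsilon}} + w_1\,\frac{x^2}{\sqrt{\overline{x^4}+\varepsilon}} + w_2\,\frac{x}{\sqrt{\overline{x^2}+\varepsilon}} + b,
$$

where the bar denotes the mean over the hidden dimension. This annotation only restates the removed code and is not part of the diff itself.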
@@ -7,7 +7,8 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 @torch.inference_mode()
@@ -9,9 +9,9 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )

@@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )

@@ -1251,7 +1251,7 @@ async def main() -> None:
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )

     parser.add_argument(
@@ -188,16 +188,47 @@ else()
   message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()

-#
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
-  set(USE_ACL ON)
-else()
-  set(USE_ACL OFF)
-endif()
-
+# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+  # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
+  # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+  if(ASIMD_FOUND)
+    if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
+      message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
+    else()
+      message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
+      FetchContent_Populate(arm_compute
+        SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
+        SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
+        GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
+        GIT_TAG v52.2.0
+        GIT_SHALLOW TRUE
+        GIT_PROGRESS TRUE
+      )
+      set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+    endif()
+
+    # Build ACL with scons
+    include(ProcessorCount)
+    ProcessorCount(_NPROC)
+    execute_process(
+      COMMAND scons -j${_NPROC}
+              Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+              arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+              multi_isa=1 openmp=1 cppthreads=0
+      WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+      RESULT_VARIABLE _acl_rc
+    )
+    if(NOT _acl_rc EQUAL 0)
+      message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+    endif()
+
+    set(ONEDNN_AARCH64_USE_ACL "ON")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+    add_compile_definitions(VLLM_USE_ACL)
+  endif()
+
   set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")

   if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@@ -217,16 +248,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
   )
 endif()

-if(USE_ACL)
-  find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-  if(NOT ARM_COMPUTE_LIBRARY)
-    message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
-  endif()
-  set(ONEDNN_AARCH64_USE_ACL "ON")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-  add_compile_definitions(VLLM_USE_ACL)
-endif()
-
 set(ONEDNN_LIBRARY_TYPE "STATIC")
 set(ONEDNN_BUILD_DOC "OFF")
 set(ONEDNN_BUILD_EXAMPLES "OFF")
@@ -19,7 +19,7 @@ else()
   FetchContent_Declare(
     flashmla
     GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-    GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+    GIT_TAG 28417e516fcbf6257a422ba117ef5b6f44da5682
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
     ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
     ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
     ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
   )

   set(FlashMLA_INCLUDES
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
+    GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -125,32 +125,37 @@ public:
   }

   static void set_split_kv (KernelArguments& args) {
-    // printf("set_split_kv start");
     if (args.split_kv >= 1) return;
     auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";
     int sm_count = args.hw_info.sm_count;
-    // printf(" sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
-
-    // TODO: This avoids a hang when the batch size larger than 1 and
-    // there is more than 1 kv_splits.
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(1, max_splits);
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    int max_splits = 1;
+
+    if (B <= 4 && seq_length_k >= 16) {
+      max_splits = 16;
     }
-    // printf(" max_splits = %d\n", max_splits);
+    else if (B <= 8 && seq_length_k >= 4) {
+      max_splits = 8;
+    }
+    else if ((B <= 16 && seq_length_k >= 8) ||
+             (B == 48 && seq_length_k >= 32)) {
+      max_splits = 4;
+    }
+    else if ((B <= 32 && seq_length_k >= 16) ||
+             (B == 96 && seq_length_k >= 16)) {
+      max_splits = 2;
+    }
+    else {
+      max_splits = 1;
+    }

+    // Wave-aware scheduling: ensure integer number of waves in K dimension
     int sms_per_batch = max(1, sm_count / B);
-    // printf(" sms_per_batch = %d\n", sms_per_batch);
     int split_heur = min(max_splits, sms_per_batch);
     int waves = ceil_div(B * split_heur, sm_count);
     int k_waves = ceil_div(max_splits, split_heur);
     int split_wave_aware = ceil_div(max_splits, k_waves);
     args.split_kv = split_wave_aware;
-    // printf(" args.split_kv = %d\n", args.split_kv);
-
   }

   /// Determines whether the GEMM can execute the given problem.
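The revised `set_split_kv` heuristic above picks a candidate split count from the batch size and KV length, then clamps it so the splits fill an integer number of waves. A minimal Python restatement of that logic, purely for illustration (the function name and signature are ours, not part of the kernel):

```python
import math

def choose_split_kv(B: int, K: int, sm_count: int) -> int:
    """Sketch of the new split-KV heuristic (illustrative only)."""
    seq_length_k = K / 1024.0
    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1

    # Wave-aware scheduling: keep an integer number of waves in the K dimension.
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = math.ceil(max_splits / split_heur)
    return math.ceil(max_splits / k_waves)
```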
@@ -5,11 +5,11 @@

 namespace vllm {

-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
+// vllm_is_batch_invariant(); returns true
+// if env VLLM_BATCH_INVARIANT=1
+inline bool vllm_is_batch_invariant() {
   static bool cached = []() {
-    std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
+    std::string env_key = "VLLM_BATCH_INVARIANT";
     const char* val = std::getenv(env_key.c_str());
     return (val && std::atoi(val) != 0) ? 1 : 0;
   }();
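Since the helper above caches the environment variable on first use, the switch has to be set before the CUDA ops are first called. A tiny illustrative sketch (how and where the variable is set is up to the deployment, this is only an example):

```python
import os

# Request batch-invariant kernel launches before any vLLM CUDA op runs.
os.environ["VLLM_BATCH_INVARIANT"] = "1"
```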
@@ -148,211 +148,6 @@ fused_add_rms_norm_kernel(
   }
 }

-/* Function specialization in the case of FP16/BF16 tensors.
-   Additional optimizations we can make in this case are
-   packed and vectorized operations, which help with the
-   memory latency bottleneck.
-
-   _f16VecPN struct extends _f16Vec to add operations specifically required for
-   polynomial normalization (poly norm).
-   The original _f16Vec does not include the sum-of-powers computation or
-   in-place polynomial normalization logic. */
-template <typename scalar_t, int width>
-struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
-  using Base = _f16Vec<scalar_t, width>;
-  using Converter = typename Base::Converter;
-  using T1 = typename Base::T1;
-  using T2 = typename Base::T2;
-  using Base::data;
-
-  __device__ auto sum_pows() const {
-    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < width; i += 2) {
-      float2 z = Converter::convert(T2{data[i], data[i + 1]});
-      float x2 = z.x * z.x;
-      float x4 = x2 * x2;
-      float x6 = x4 * x2;
-
-      float y2 = z.y * z.y;
-      float y4 = y2 * y2;
-      float y6 = y4 * y2;
-
-      s2 += x2 + y2;
-      s4 += x4 + y4;
-      s6 += x6 + y6;
-    }
-    return std::make_tuple(s2, s4, s6);
-  }
-
-  __device__ void poly_norm_inplace(const float w2_inv_std,
-                                    const float w1_inv_std2,
-                                    const float w0_inv_std3, const float bias) {
-#pragma unroll
-    for (int i = 0; i < width; i += 2) {
-      float2 z = Converter::convert(T2{data[i], data[i + 1]});
-
-      float x2 = z.x * z.x;
-      float x3 = x2 * z.x;
-      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
-
-      float y2 = z.y * z.y;
-      float y3 = y2 * z.y;
-      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
-
-      auto out = Converter::convert(z);
-      data[i] = out.x;
-      data[i + 1] = out.y;
-    }
-  }
-};
-
-template <typename scalar_t, int width>
-__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
-poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 const scalar_t* __restrict__ input,   // [..., hidden_size]
-                 const scalar_t* __restrict__ weight,  // [3]
-                 const scalar_t* __restrict__ bias,    // [1]
-                 const float epsilon, const int hidden_size) {
-  // Sanity checks on our vector struct and type-punned pointer arithmetic
-  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
-  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
-
-  /* These and the argument pointers are all declared `restrict` as they are
-     not aliased in practice. Argument pointers should not be dereferenced
-     in this kernel as that would be undefined behavior */
-  auto* __restrict__ input_v =
-      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
-  const int vec_hidden_size = hidden_size / width;
-  float variance = 0.0f;
-  float variance2 = 0.0f;
-  float variance3 = 0.0f;
-
-  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16VecPN<scalar_t, width> temp = input_v[id];
-    auto [x2, x4, x6] = temp.sum_pows();
-
-    variance += x2;
-    variance2 += x4;
-    variance3 += x6;
-  }
-
-  float3 thread_variances = make_float3(variance, variance2, variance3);
-
-  struct SumOp {
-    __device__ float3 operator()(const float3& a, const float3& b) const {
-      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-  };
-
-  using BlockReduce = cub::BlockReduce<float3, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  float3 block_variances =
-      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
-
-  variance = block_variances.x;
-  variance2 = block_variances.y;
-  variance3 = block_variances.z;
-
-  __shared__ float s_w2_inv_std;
-  __shared__ float s_w1_inv_std2;
-  __shared__ float s_w0_inv_std3;
-  __shared__ float s_bias;
-
-  if (threadIdx.x == 0) {
-    float w0 = (float)weight[0];
-    float w1 = (float)weight[1];
-    float w2 = (float)weight[2];
-    s_bias = (float)bias[0];
-
-    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
-    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
-    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
-
-  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
-    int id = blockIdx.x * vec_hidden_size + idx;
-    _f16VecPN<scalar_t, width> temp = input_v[id];
-    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
-    out_v[id] = temp;
-  }
-}
-
-/* Generic poly_norm_kernel
-   The width field is not used here but necessary for other specializations.
-*/
-template <typename scalar_t, int width>
-__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
-poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 const scalar_t* __restrict__ input,   // [..., hidden_size]
-                 const scalar_t* __restrict__ weight,  // [3]
-                 const scalar_t* __restrict__ bias,    // [1]
-                 const float epsilon, const int hidden_size) {
-  float variance = 0.0f;
-  float variance2 = 0.0f;
-  float variance3 = 0.0f;
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    float x2 = x * x;
-    float x4 = x2 * x2;
-    float x6 = x4 * x2;
-
-    variance += x2;
-    variance2 += x4;
-    variance3 += x6;
-  }
-
-  float3 thread_variances = make_float3(variance, variance2, variance3);
-
-  struct SumOp {
-    __device__ float3 operator()(const float3& a, const float3& b) const {
-      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-  };
-
-  using BlockReduce = cub::BlockReduce<float3, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  float3 block_variances =
-      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
-
-  variance = block_variances.x;
-  variance2 = block_variances.y;
-  variance3 = block_variances.z;
-
-  __shared__ float s_w2_inv_std;
-  __shared__ float s_w1_inv_std2;
-  __shared__ float s_w0_inv_std3;
-  __shared__ float s_bias;
-
-  if (threadIdx.x == 0) {
-    float w0 = (float)weight[0];
-    float w1 = (float)weight[1];
-    float w2 = (float)weight[2];
-    s_bias = (float)bias[0];
-
-    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
-    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
-    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    float x2 = x * x;
-    float x3 = x2 * x;
-
-    out[blockIdx.x * hidden_size + idx] =
-        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
-                   s_bias);
-  }
-}
-
 } // namespace vllm

 void rms_norm(torch::Tensor& out,    // [..., hidden_size]
@@ -364,18 +159,26 @@ void rms_norm(torch::Tensor& out,    // [..., hidden_size]
   TORCH_CHECK(weight.is_contiguous());

   int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-  int64_t input_stride = input.stride(-2);
+  // We cannot just use `input.stride(-2)` if the tensor is not row-major.
+  // Instead, we use a 2d view to get the second-innermost stride.
+  // That way the dimensions (except the last one) can be arbitrarily permuted.
+  torch::Tensor input_view = input.view({-1, hidden_size});
+
+  int num_tokens = input_view.numel() / hidden_size;
+  int64_t input_stride = input_view.stride(-2);

   dim3 grid(num_tokens);
   dim3 block(std::min(hidden_size, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
-    vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
-        weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
-  });
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input_view.scalar_type(), "rms_norm_kernel", [&] {
+        vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+            out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
+            input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
+            hidden_size);
+      });
 }

 #define LAUNCH_FUSED_ADD_RMS_NORM(width) \
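The comment in the hunk above is the key point: the kernel needs the number of token rows and the stride between consecutive rows, and a flattened 2-D view exposes both regardless of how many leading dimensions the input has. A small PyTorch illustration of the two quantities the host code now derives (shapes chosen arbitrarily):

```python
import torch

hidden = 16
x = torch.randn(2, 3, hidden)        # [batch, seq, hidden]
x2d = x.view(-1, hidden)             # [num_tokens, hidden], no copy

num_tokens = x2d.numel() // hidden   # 6, what the grid is sized on
token_stride = x2d.stride(-2)        # elements between consecutive token rows
print(num_tokens, token_stride)
```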
@@ -392,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                         torch::Tensor& residual,  // [..., hidden_size]
                         torch::Tensor& weight,    // [hidden_size]
                         double epsilon) {
+  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
+  TORCH_CHECK(input.scalar_type() == residual.scalar_type());
   TORCH_CHECK(residual.is_contiguous());
   TORCH_CHECK(weight.is_contiguous());
   int hidden_size = input.size(-1);
@@ -426,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
       wt_ptr % req_alignment_bytes == 0;
   bool offsets_are_multiple_of_vector_width =
       hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
   if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
       !batch_invariant_launch) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
@@ -434,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
     LAUNCH_FUSED_ADD_RMS_NORM(0);
   }
 }
-
-#define LAUNCH_FUSED_POLY_NORM(width)                                         \
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
-    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
-        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
-        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
-        hidden_size);                                                         \
-  });
-
-void poly_norm(torch::Tensor& out,     // [..., hidden_size]
-               torch::Tensor& input,   // [..., hidden_size]
-               torch::Tensor& weight,  // [3]
-               torch::Tensor& bias,    // [1]
-               double epsilon) {
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.data_ptr() != input.data_ptr());
-
-  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-
-  dim3 grid(num_tokens);
-  /* This kernel is memory-latency bound in many scenarios.
-     When num_tokens is large, a smaller block size allows
-     for increased block occupancy on CUs and better latency
-     hiding on global mem ops. */
-  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
-  dim3 block(std::min(hidden_size, max_block_size));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  /*If the tensor types are FP16/BF16, try to use the optimized kernel
-    with packed + vectorized ops.
-    Max optimization is achieved with a width-8 vector of FP16/BF16s
-    since we can load at most 128 bits at once in a global memory op.
-    However, this requires each tensor's data to be aligned to 16
-    bytes.
-  */
-  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
-  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
-  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
-    LAUNCH_FUSED_POLY_NORM(8);
-  } else {
-    LAUNCH_FUSED_POLY_NORM(0);
-  }
-}
@@ -229,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
     double epsilon) {
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(residual.is_contiguous());
+  TORCH_CHECK(residual.scalar_type() == input.scalar_type());
+  TORCH_CHECK(weight.scalar_type() == input.scalar_type());
   int hidden_size = input.size(-1);
   int input_stride = input.stride(-2);
   int num_tokens = input.numel() / hidden_size;
@@ -254,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
   auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
   bool ptrs_are_aligned =
       inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
+  bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
   if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
       !batch_invariant_launch) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
@@ -8,12 +8,77 @@

 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
+#include "core/math.hpp"

 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

 namespace vllm {
 namespace moe {

+namespace batched_moe_align_block_size {
+
+// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
+static constexpr int32_t num_threads = 1024;
+static constexpr int32_t num_blocks = 1;
+__global__ void batched_moe_align_block_size_kernel(
+    int32_t const num_batches, int32_t const max_tokens_per_batch,
+    int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
+    int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
+    int32_t* __restrict__ num_tokens_post_pad) {
+  // TODO(varun): This is a naive implementation. Could be optimized.
+
+  size_t const batch_id = threadIdx.x;
+  size_t const stride = blockDim.x * gridDim.x;
+  int32_t const num_blocks_per_batch =
+      CEILDIV(max_tokens_per_batch, block_size);
+  int32_t const sorted_ids_size =
+      num_blocks_per_batch * num_batches * block_size;
+  int32_t const block_ids_size = sorted_ids_size / block_size;
+  int32_t const SENTINEL =
+      num_batches * max_tokens_per_batch;  // To denote invalid entries.
+  // Intialize sorted_ids
+  for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
+    sorted_ids[i] = SENTINEL;
+  }
+  // Intialize expert_ids with -1
+  for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
+    block_ids[i] = -1;
+  }
+
+  int32_t b_num_tokens = 0;
+  if (batch_id < num_batches) {
+    b_num_tokens = batch_num_tokens[batch_id];
+  }
+  int32_t const ceil_b_num_tokens =
+      CEILDIV(b_num_tokens, block_size) * block_size;
+
+  // Compute prefix sum over token counts per expert
+  using BlockScan = cub::BlockScan<int32_t, 1024>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+  int cumsum_val;
+  BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
+  __syncthreads();
+
+  bool const is_last_batch = batch_id == (num_batches - 1);
+  if (is_last_batch) {
+    *num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
+  }
+
+  if (batch_id < num_batches) {
+    int32_t const batch_offset = batch_id * max_tokens_per_batch;
+    for (size_t i = 0; i < b_num_tokens; ++i) {
+      sorted_ids[cumsum_val + i] = batch_offset + i;
+    }
+
+    int32_t const block_start = cumsum_val / block_size;
+    int32_t const num_blocks = ceil_b_num_tokens / block_size;
+    for (size_t i = 0; i < num_blocks; ++i) {
+      block_ids[block_start + i] = batch_id;
+    }
+  }
+}
+}  // namespace batched_moe_align_block_size
+
 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
     const scalar_t* __restrict__ topk_ids,
@@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
       });
 }

+void batched_moe_align_block_size(int64_t max_tokens_per_batch,
+                                  int64_t block_size,
+                                  torch::Tensor const& batch_num_tokens,
+                                  torch::Tensor sorted_ids,
+                                  torch::Tensor batch_ids,
+                                  torch::Tensor num_tokens_post_pad) {
+  namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  int32_t const B = batch_num_tokens.size(0);
+  int32_t const num_blocks_per_batch =
+      round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
+  int32_t const num_blocks = num_blocks_per_batch * B;
+  int64_t const sorted_ids_size = num_blocks * block_size;
+
+  TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
+  TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
+  TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
+  TORCH_CHECK(B <= batched_kernel::num_threads);
+
+  batched_kernel::batched_moe_align_block_size_kernel<<<
+      batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
+      B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
+      sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
+      num_tokens_post_pad.data_ptr<int32_t>());
+}
+
 void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
              torch::Tensor& output)  // [num_tokens, hidden_size]
 {
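As a sanity check on the padding arithmetic above, the same layout can be computed on the CPU. The sketch below is an illustrative Python reference, not part of the change; the function name and return convention are ours:

```python
def batched_moe_align_reference(batch_num_tokens, max_tokens_per_batch, block_size):
    """CPU reference of the kernel's sorted_ids / block_ids layout (illustrative)."""
    ceil_div = lambda a, b: (a + b - 1) // b
    B = len(batch_num_tokens)
    blocks_per_batch = ceil_div(max_tokens_per_batch, block_size)
    sorted_ids_size = blocks_per_batch * B * block_size

    sentinel = B * max_tokens_per_batch          # marks invalid entries
    sorted_ids = [sentinel] * sorted_ids_size
    block_ids = [-1] * (sorted_ids_size // block_size)

    cumsum = 0                                   # exclusive prefix of padded counts
    for b, n in enumerate(batch_num_tokens):
        padded = ceil_div(n, block_size) * block_size
        for i in range(n):
            sorted_ids[cumsum + i] = b * max_tokens_per_batch + i
        for i in range(padded // block_size):
            block_ids[cumsum // block_size + i] = b
        cumsum += padded
    return sorted_ids, block_ids, cumsum         # cumsum == num_tokens_post_pad
```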
csrc/moe/moe_lora_align_sum_kernels.cu (new file, 169 lines)
@@ -0,0 +1,169 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/Atomic.cuh>
+
+#include "../cuda_compat.h"
+#include "../dispatch_utils.h"
+#include "core/math.hpp"
+
+namespace {
+
+__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
+                                         int32_t col) {
+  return row * total_col + col;
+}
+
+}  // namespace
+
+// TODO: Refactor common parts with moe_align_sum_kernels
+template <typename scalar_t, typename token_cnts_t>
+__global__ void moe_lora_align_sum_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+    int64_t block_size, int num_experts, int max_loras, size_t numel,
+    int max_num_tokens_padded, int max_num_m_blocks,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int topk_num, int32_t* total_tokens_post_pad) {
+  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
+  const size_t start_idx = threadIdx.x * tokens_per_thread;
+
+  int lora_id = blockIdx.x;
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
+
+  // Initialize sorted_token_ids with numel
+  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
+    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
+  }
+
+  // Initialize expert_ids with -1
+  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
+    expert_ids[lora_id * max_num_m_blocks + it] = -1;
+  }
+
+  // Initialize total_tokens_post_pad with 0
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+  }
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
+  }
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int mask = token_lora_mapping[i / topk_num] == lora_id;
+    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
+    tokens_cnts[idx] += mask;
+  }
+
+  __syncthreads();
+
+  // For each expert we accumulate the token counts from the different threads.
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
+          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+    }
+  }
+
+  __syncthreads();
+
+  // We accumulate the token counts of all experts in thread 0.
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] = cumsum[i - 1] +
+                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
+                           block_size) *
+                      block_size;
+    }
+    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  /**
+   * For each expert, each thread processes the tokens of the corresponding
+   * blocks and stores the corresponding expert_id for each block.
+   */
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+         i += block_size) {
+      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
+          threadIdx.x;
+    }
+  }
+
+  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+    int32_t expert_id = topk_ids[i];
+    /** The cumsum[expert_id] stores the starting index of the tokens that the
+     * expert with expert_id needs to process, and
+     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
+     * processed by the expert with expert_id within the current thread's token
+     * shard.
+     */
+    int32_t rank_post_pad =
+        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
+        cumsum[expert_id];
+
+    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
+    atomicAdd(
+        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
+        (i - numel) * mask);
+    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
+  }
+}
+
+void moe_lora_align_block_size(torch::Tensor topk_ids,
+                               torch::Tensor token_lora_mapping,
+                               int64_t num_experts, int64_t block_size,
+                               int64_t max_loras, int64_t max_num_tokens_padded,
+                               int64_t max_num_m_blocks,
+                               torch::Tensor sorted_token_ids,
+                               torch::Tensor expert_ids,
+                               torch::Tensor num_tokens_post_pad) {
+  const int topk_num = topk_ids.size(1);
+
+  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
+  TORCH_CHECK(num_thread <= 1024,
+              "num_thread must be less than 1024, "
+              "and fallback is not implemented yet.");
+  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
+                             (num_experts + 1) * sizeof(int32_t);
+
+  if (shared_mem > device_max_shared_mem) {
+    TORCH_CHECK(false,
+                "Shared memory usage exceeds device limit, and global memory "
+                "fallback is not implemented yet.");
+  }
+
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+        dim3 blockDim(num_thread);
+        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
+        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+            (void*)kernel, shared_mem));
+        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
+            topk_ids.data_ptr<scalar_t>(),
+            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
+            max_loras, topk_ids.numel(), max_num_tokens_padded,
+            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
+            expert_ids.data_ptr<int32_t>(), topk_num,
+            num_tokens_post_pad.data_ptr<int32_t>());
+      });
+}
@@ -4,7 +4,7 @@

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                   torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output);
+                  torch::Tensor& gating_output, bool renormalize);

 void moe_sum(torch::Tensor& input, torch::Tensor& output);

@@ -12,6 +12,22 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad);
+
+void batched_moe_align_block_size(int64_t max_tokens_per_batch,
+                                  int64_t block_size,
+                                  torch::Tensor const& expert_num_tokens,
+                                  torch::Tensor sorted_ids,
+                                  torch::Tensor expert_ids,
+                                  torch::Tensor num_tokens_post_pad);
+
+void moe_lora_align_block_size(torch::Tensor topk_ids,
+                               torch::Tensor token_lora_mapping,
+                               int64_t num_experts, int64_t block_size,
+                               int64_t max_loras, int64_t max_num_tokens_padded,
+                               int64_t max_num_m_blocks,
+                               torch::Tensor sorted_token_ids,
+                               torch::Tensor expert_ids,
+                               torch::Tensor num_tokens_post_pad);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -16,12 +16,23 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <type_traits>
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"

+#ifndef USE_ROCM
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#else
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+#endif
+
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))

@@ -36,16 +47,27 @@ template <
     /// Alignment requirement in bytes
     int Alignment = sizeof(T) * N
 >
-class alignas(Alignment) AlignedArray {
-    float data[N];
+struct alignas(Alignment) AlignedArray {
+    T data[N];
 };

+template <typename T>
+__device__ __forceinline__ float toFloat(T value) {
+    if constexpr (std::is_same_v<T, float>) {
+        return value;
+    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+        return __bfloat162float(value);
+    } else if constexpr (std::is_same_v<T, __half>) {
+        return __half2float(value);
+    }
+}
+
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing the output
 // in the softmax kernel when we extend this module to support expert-choice routing.
-template <int TPB>
+template <int TPB, typename InputType>
 __launch_bounds__(TPB) __global__
-    void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
+    void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
 {
     using BlockReduce = cub::BlockReduce<float, TPB>;
     __shared__ typename BlockReduce::TempStorage tmpStorage;
@@ -66,7 +88,8 @@ __launch_bounds__(TPB) __global__
     for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
     {
         const int idx = thread_row_offset + ii;
-        threadData = max(static_cast<float>(input[idx]), threadData);
+        const float val = toFloat(input[idx]);
+        threadData = max(val, threadData);
     }

     const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@@ -81,7 +104,8 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
-        threadData += exp((static_cast<float>(input[idx]) - float_max));
+        const float val = toFloat(input[idx]);
+        threadData += expf(val - float_max);
    }

    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@@ -95,8 +119,9 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
-        const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
-        output[idx] = val;
+        const float val = toFloat(input[idx]);
+        const float softmax_val = expf(val - float_max) * normalizing_factor;
+        output[idx] = softmax_val;
    }
 }

@@ -110,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
     const int num_experts,
     const int k,
     const int start_expert,
-    const int end_expert)
+    const int end_expert,
+    const bool renormalize)
 {

     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -125,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(

     const bool row_is_active = finished ? !finished[block_row] : true;
     const int thread_read_offset = blockIdx.x * num_experts;
+    float selected_sum = 0.f;
     for (int k_idx = 0; k_idx < k; ++k_idx)
     {
         thread_kvp.key = 0;
@@ -163,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
             indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
             assert(indices[idx] >= 0);
             source_rows[idx] = k_idx * num_rows + block_row;
+            if (renormalize) {
+                selected_sum += result_kvp.value;
+            }
         }
         __syncthreads();
     }

+    // Renormalize the k weights for this row to sum to 1, if requested.
+    if (renormalize) {
+        if (threadIdx.x == 0) {
+            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
+            for (int k_idx = 0; k_idx < k; ++k_idx) {
+                const int idx = k * block_row + k_idx;
+                output[idx] = output[idx] / denom;
+            }
+        }
+    }
 }

 // ====================== TopK softmax things ===============================
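The renormalization added above follows the usual MoE routing recipe: softmax over all experts, take the top-k weights, then rescale the selected weights to sum to 1. A short PyTorch sketch of the intended semantics, for illustration only (this is not the kernel itself and the helper name is ours):

```python
import torch

def topk_softmax_reference(gating_logits: torch.Tensor, k: int, renormalize: bool):
    # gating_logits: [num_tokens, num_experts]
    probs = torch.softmax(gating_logits.float(), dim=-1)
    topk_weights, topk_ids = probs.topk(k, dim=-1)
    if renormalize:
        # Rescale the selected k weights so they sum to 1 per token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
```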
@@ -184,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
   2) This implementation assumes k is small, but will work for any k.
 */

-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
-        int* source_rows, const int k, const int start_expert, const int end_expert)
+    void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
+        int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
 {
+    static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
+                      std::is_same_v<InputType, __half>,
+                  "InputType must be float, __nv_bfloat16, or __half");
+
     // We begin by enforcing compile time assertions and setting up compile time constants.
     static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
     static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");

     // Number of bytes each thread pulls in per load
-    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
+    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
     static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
     static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
     static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;

+    if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
+        static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
+                      "ELTS_PER_LDG must be 1 or even for 16-bit conversion");
+    }
+
     // Restrictions based on previous section.
     static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
     static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@@ -236,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__

     // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
     // row it will read.
-    const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
+    const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;

     // Now, we compute the group each thread belong to in order to determine the first column to start loads.
     const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
     const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
-    const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
+    const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

-    // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
-    // this can support all powers of 2 up to 16.
-    // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
-    // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
-    using AccessType = AlignedArray<float, ELTS_PER_LDG>;
-
     // Finally, we pull in the data from global mem
     float row_chunk[VPT];
-    AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
-    const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
+    // NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
+    if constexpr (std::is_same_v<InputType, float>) {
+        using VecType = AlignedArray<float, ELTS_PER_LDG>;
+        VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
+        const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
 #pragma unroll
-    for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
-    {
-        row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+        for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+            row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+        }
+    } else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
+        if constexpr (ELTS_PER_LDG >= 2) {
+            using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
+            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
+            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
+#pragma unroll
+                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
+                    row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
+                        *reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
+                    );
+                }
+            }
+        } else { // ELTS_PER_LDG == 1
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
+                row_chunk[ii] = __bfloat162float(*scalar_ptr);
+            }
+        }
+    } else if constexpr (std::is_same_v<InputType, __half>) {
+        if constexpr (ELTS_PER_LDG >= 2) {
+            using VecType = AlignedArray<__half, ELTS_PER_LDG>;
+            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
+            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
+#pragma unroll
+            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
+#pragma unroll
+                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
|
||||||
|
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
|
||||||
|
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else { // ELTS_PER_LDG == 1
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||||
|
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
|
||||||
|
row_chunk[ii] = __half2float(*scalar_ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||||
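
A minimal sketch of the 16-bit widening used in the load loop above (not part of the diff); it assumes `cuda_bf16.h` is available, an even `ELTS_PER_LDG`, and two adjacent source values, mirroring the `__bfloat1622float2()` conversion:

```cpp
#include <cuda_bf16.h>

// Widen two adjacent bf16 values to float in one step, as the vectorized
// branch of the load loop does for each packed pair.
__device__ inline void widen_bf16_pair(const __nv_bfloat16* src, float* dst) {
  const __nv_bfloat162 packed = *reinterpret_cast<const __nv_bfloat162*>(src);
  const float2 f = __bfloat1622float2(packed);
  dst[0] = f.x;
  dst[1] = f.y;
}
```

With `ELTS_PER_LDG == 1` the kernel instead falls back to the scalar `__bfloat162float()` path shown in the hunk.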
@@ -310,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
     int start_col = first_elt_read_by_thread;
     static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;

+    float selected_sum = 0.f;
     for (int k_idx = 0; k_idx < k; ++k_idx)
     {
         // First, each thread does the local argmax
@@ -363,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
             output[idx] = max_val;
             indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
             source_rows[idx] = k_idx * num_rows + thread_row;
+            if (renormalize) {
+                selected_sum += max_val;
+            }
         }

         // Finally, we clear the value in the thread with the current max if there is another iteration to run.
@@ -380,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
             }
         }
     }
+
+    // Renormalize the k weights for this row to sum to 1, if requested.
+    if (renormalize) {
+        if (thread_group_idx == 0)
+        {
+            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
+            for (int k_idx = 0; k_idx < k; ++k_idx)
+            {
+                const int idx = k * thread_row + k_idx;
+                output[idx] = output[idx] / denom;
+            }
+        }
+    }
 }

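A host-side sketch of what the new `renormalize` path computes for one token's k selected weights (not part of the diff); the zero-sum guard matches the kernel's `denom` logic:

```cpp
#include <vector>

// Rescale one token's top-k gating weights in place so they sum to 1,
// leaving them untouched when the selected sum is zero.
void renormalize_topk_weights(std::vector<float>& weights) {
  float selected_sum = 0.f;
  for (float w : weights) selected_sum += w;
  const float denom = selected_sum > 0.f ? selected_sum : 1.f;
  for (float& w : weights) w /= denom;
}
```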
 namespace detail
 {
 // Constructs some constants needed to partition the work across threads at compile time.
-template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
+template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
 struct TopkConstants
 {
-    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
+    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
     static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
     static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
     static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@@ -397,20 +508,21 @@ struct TopkConstants
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
-    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
+template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
+void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
+    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
+    cudaStream_t stream)
 {
-    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
-    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
+    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
+    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
     static constexpr int VPT = Constants::VPT;
     static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
     const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
     const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

     dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
-    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
-        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
+    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
+        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
 }

 #ifndef USE_ROCM
@@ -418,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
     static_assert(WARP_SIZE == 32, \
                   "Unsupported warp size. Only 32 is supported for CUDA"); \
     topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
-        gating_output, nullptr, topk_weights, topk_indices, \
-        token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+        gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
+        num_tokens, topk, 0, num_experts, renormalize, stream);
 #else
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
     if (WARP_SIZE == 64) { \
         topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
-            gating_output, nullptr, topk_weights, topk_indices, \
-            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
+            num_tokens, topk, 0, num_experts, renormalize, stream); \
     } else if (WARP_SIZE == 32) { \
         topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
-            gating_output, nullptr, topk_weights, topk_indices, \
-            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
+            num_tokens, topk, 0, num_experts, renormalize, stream); \
     } else { \
         assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
     }
 #endif

-template <typename IndType>
+template <typename IndType, typename InputType>
 void topkGatingSoftmaxKernelLauncher(
-    const float* gating_output,
+    const InputType* gating_output,
     float* topk_weights,
     IndType* topk_indices,
     int* token_expert_indices,
@@ -445,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
     const int num_tokens,
     const int num_experts,
     const int topk,
+    const bool renormalize,
     cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
   static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
 #ifndef USE_ROCM
-  static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+  // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
+  // elements can be loaded by a warp
+  static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
+      (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
 #endif
   switch (num_experts) {
     case 1:
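A worked example of the load-width constants above (not part of the diff); the expert count of 64 is assumed for illustration only:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  constexpr int kExperts = 64;        // assumed for the example
  constexpr int kSizeofBf16 = 2;      // sizeof(__nv_bfloat16)
  constexpr int kMaxBytes = 4;        // BYTES_PER_LDG_MULTIPLE_64 for bf16/fp16
  constexpr int kBytesPerLdg = std::min(kMaxBytes, kSizeofBf16 * kExperts);
  constexpr int kEltsPerLdg = kBytesPerLdg / kSizeofBf16;
  // Prints BYTES_PER_LDG=4, ELTS_PER_LDG=2: each 4-byte load still covers two experts.
  std::printf("BYTES_PER_LDG=%d, ELTS_PER_LDG=%d\n", kBytesPerLdg, kEltsPerLdg);
  return 0;
}
```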
@@ -506,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
       TORCH_CHECK(softmax_workspace != nullptr,
           "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
       static constexpr int TPB = 256;
-      moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
+      moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
           gating_output, nullptr, softmax_workspace, num_experts);
       moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
           softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
-          num_experts, topk, 0, num_experts);
+          num_experts, topk, 0, num_experts, renormalize);
     }
   }
 }
@@ -518,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
 } // namespace moe
 } // namespace vllm

+template<typename ComputeType>
+void dispatch_topk_softmax_launch(
+    torch::Tensor& gating_output,
+    torch::Tensor& topk_weights,
+    torch::Tensor& topk_indices,
+    torch::Tensor& token_expert_indices,
+    torch::Tensor& softmax_workspace,
+    int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
+{
+  if (topk_indices.scalar_type() == at::ScalarType::Int) {
+    vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
+        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        token_expert_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens, num_experts, topk, renormalize, stream);
+  } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
+    vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
+        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<uint32_t>(),
+        token_expert_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens, num_experts, topk, renormalize, stream);
+  } else {
+    TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
+    vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
+        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int64_t>(),
+        token_expert_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens, num_experts, topk, renormalize, stream);
+  }
+}
+
 void topk_softmax(
     torch::Tensor& topk_weights,          // [num_tokens, topk]
     torch::Tensor& topk_indices,          // [num_tokens, topk]
     torch::Tensor& token_expert_indices,  // [num_tokens, topk]
-    torch::Tensor& gating_output)         // [num_tokens, num_experts]
+    torch::Tensor& gating_output,         // [num_tokens, num_experts]
+    bool renormalize)
 {
     const int num_experts = gating_output.size(-1);
     const auto num_tokens = gating_output.numel() / num_experts;
@@ -534,45 +689,19 @@ void topk_softmax(

     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
+    const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
+    torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);

-    if(topk_indices.scalar_type() == at::ScalarType::Int)
-    {
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
-    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
-    {
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<uint32_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
-    else {
-        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int64_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
+    if (gating_output.scalar_type() == at::ScalarType::Float) {
+        dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
+        dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
+        dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+    } else {
+        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
+    }
 }
@@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   // Apply topk softmax to the gating outputs.
   m.def(
       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output) -> ()");
+      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

   // Calculate the result of moe by summing up the partial results
@@ -22,6 +22,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " Tensor! num_tokens_post_pad) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size, but for the batched case.
+  m.def(
+      "batched_moe_align_block_size(int max_tokens_per_batch,"
+      " int block_size, Tensor expert_num_tokens,"
+      " Tensor! sorted_token_ids,"
+      " Tensor! experts_ids,"
+      " Tensor! num_tokens_post_pad) -> ()");
+  m.impl("batched_moe_align_block_size", torch::kCUDA,
+         &batched_moe_align_block_size);
+
+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size.
+  m.def(
+      "moe_lora_align_block_size(Tensor topk_ids,"
+      " Tensor token_lora_mapping,"
+      " int num_experts,"
+      " int block_size, int max_loras, "
+      " int max_num_tokens_padded, "
+      " int max_num_m_blocks, "
+      " Tensor !sorted_token_ids,"
+      " Tensor !experts_ids,"
+      " Tensor !num_tokens_post_pad) -> () ");
+  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
+
 #ifndef USE_ROCM
   m.def(
       "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
csrc/ops.h
@@ -92,9 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);

-void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-               torch::Tensor& bias, double epsilon);
-
 void apply_repetition_penalties_(torch::Tensor& logits,
                                  const torch::Tensor& prompt_mask,
                                  const torch::Tensor& output_mask,
@@ -102,8 +99,11 @@ void apply_repetition_penalties_(torch::Tensor& logits,

 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                    const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   torch::Tensor& values, int64_t numRows, int64_t stride0,
-                   int64_t stride1);
+                   int64_t numRows, int64_t stride0, int64_t stride1);
+
+void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
+                          const torch::Tensor& seq_lens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1);

 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
@@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
   if (scale_ub.has_value()) {
     TORCH_CHECK(out.dtype() == kFp8Type);
   }
+  TORCH_CHECK(weight.dtype() == input.dtype());
   TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }

   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {
csrc/sampler.cu
@@ -54,15 +54,10 @@ static inline __device__ uint16_t extractBinIdx(float x) {
   return 511 - (tmp.u16 >> 7);
 }

-template <int kNumThreadsPerBlock = 512>
-static __global__ void topKPerRow(const float* logits, const int* rowStarts,
-                                  const int* rowEnds, int* outIndices,
-                                  float* outLogits, int stride0, int stride1) {
-  // The number of bins in the histogram.
-  static constexpr int kNumBins = 512;
-
-  // The top-k width.
-  static constexpr int kTopK = 2048;
+template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
+__device__ void topKPerRowJob(const float* logits, const int rowStart,
+                              const int rowEnd, const int rowIdx,
+                              int* outIndices, int stride0, int stride1) {
   // The number of elements per thread for the final top-k sort.
   static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
   // The class to sort the elements during the final top-k sort.
@@ -103,17 +98,11 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
   __shared__ int smemHistogram[kNumBins];
   // Shared memory to store the selected indices.
   __shared__ int smemIndices[kTopK];
-  // Shared memory to store the selected logits.
-  __shared__ float smemLogits[kTopK];
   // Shared memory to store the threshold bin.
   __shared__ int smemThresholdBinIdx[1];
   // Shared memory counter to register the candidates for the final phase.
   __shared__ int smemFinalDstIdx[1];

-  // The row computed by this block.
-  int rowIdx = blockIdx.x;
-  // The range of logits within the row.
-  int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
   // The length of the row.
   int rowLen = rowEnd - rowStart;

@@ -124,13 +113,10 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
          rowIt += kNumThreadsPerBlock) {
       int idx = rowStart + rowIt;
       outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
-      outLogits[rowIdx * kTopK + rowIt] =
-          logits[rowIdx * stride0 + idx * stride1];
     }
     for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
          rowIt += kNumThreadsPerBlock) {
       outIndices[rowIdx * kTopK + rowIt] = -1;
-      outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
     }
     return;
   }
@@ -201,7 +187,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
     uint16_t idx = extractBinIdx(logit);
     if (idx < thresholdBinIdx) {
       int dstIdx = atomicAdd(&smemHistogram[idx], 1);
-      smemLogits[dstIdx] = logit;
       smemIndices[dstIdx] = rowIt;
     } else if (idx == thresholdBinIdx) {
       int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
@@ -250,7 +235,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
       int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
       int dstIdx = baseIdx + srcIdx;
       if (dstIdx < kTopK) {
-        smemLogits[dstIdx] = finalLogits[ii];
         smemIndices[dstIdx] = finalIndices[ii];
       }
     }
@@ -258,31 +242,58 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
   // Make sure the data is in shared memory.
   __syncthreads();

-  // The topK logits.
-  float topKLogits[kNumTopKItemsPerThread];
-  // The topK indices.
-  int topKIndices[kNumTopKItemsPerThread];
-
-  // Load from shared memory.
-#pragma unroll
-  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
-    topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
-    topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
-  }
-
-  // Sort the elements.
-  TopKSort(smemFinal.topKSort)
-      .SortDescendingBlockedToStriped(topKLogits, topKIndices);
-
   // Store to global memory.
 #pragma unroll
   for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
     int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
-    outIndices[offset] = topKIndices[ii] - rowStart;
-    outLogits[offset] = topKLogits[ii];
+    outIndices[offset] =
+        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
   }
 }

+template <int kNumThreadsPerBlock = 512>
+static __global__ void topKPerRow(const float* logits, const int* rowStarts,
+                                  const int* rowEnds, int* outIndices,
+                                  int stride0, int stride1) {
+  // The number of bins in the histogram.
+  static constexpr int kNumBins = 512;
+
+  // The top-k width.
+  static constexpr int kTopK = 2048;
+
+  // The row computed by this block.
+  int rowIdx = blockIdx.x;
+
+  // The range of logits within the row.
+  int rowStart = rowStarts[rowIdx];
+  int rowEnd = rowEnds[rowIdx];
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
+      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+}
+
+template <int kNumThreadsPerBlock = 512>
+static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
+                                        int* outIndices, int stride0,
+                                        int stride1, int next_n) {
+  // The number of bins in the histogram.
+  static constexpr int kNumBins = 512;
+
+  // The top-k width.
+  static constexpr int kTopK = 2048;
+
+  // The row computed by this block.
+  int rowIdx = blockIdx.x;
+
+  // The range of logits within the row.
+  int rowStart = 0;
+  int seq_len = seqLens[rowIdx / next_n];
+  int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
+      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+}
+
 } // namespace vllm

 void apply_repetition_penalties_(
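
A worked example of the row-range arithmetic used by `topKPerRowDecode` above (not part of the diff); reading `next_n` as the number of speculative positions per request is an interpretation, and the values below are assumed:

```cpp
#include <cstdio>

int main() {
  const int next_n = 2;            // assumed positions per request
  const int seq_lens[] = {5, 9};   // assumed per-request sequence lengths
  for (int rowIdx = 0; rowIdx < 4; ++rowIdx) {
    const int seq_len = seq_lens[rowIdx / next_n];
    const int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
    std::printf("row %d -> [0, %d)\n", rowIdx, rowEnd);  // prints 4, 5, 8, 9
  }
  return 0;
}
```

Consecutive rows of the same request see ranges that grow by one logit, matching the per-position decode layout.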
@@ -326,10 +337,23 @@ void apply_repetition_penalties_(
       });
 }

+void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
+                          const torch::Tensor& seqLens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1) {
+  // Compute the results on the device.
+  constexpr int kNumThreadsPerBlock = 512;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  vllm::topKPerRowDecode<kNumThreadsPerBlock>
+      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
+          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+          indices.data_ptr<int>(), static_cast<int>(stride0),
+          static_cast<int>(stride1), static_cast<int>(next_n));
+}
+
 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                    const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   torch::Tensor& values, int64_t numRows, int64_t stride0,
-                   int64_t stride1) {
+                   int64_t numRows, int64_t stride0, int64_t stride1) {
   // Compute the results on the device.
   constexpr int kNumThreadsPerBlock = 512;
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -338,6 +362,5 @@ void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
       <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
           logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
           rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
-          values.data_ptr<float>(), static_cast<int>(stride0),
-          static_cast<int>(stride1));
+          static_cast<int>(stride0), static_cast<int>(stride1));
 }
@@ -175,12 +175,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);

-  // Polynomial Normalization.
-  ops.def(
-      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
-      "epsilon) -> ()");
-  ops.impl("poly_norm", torch::kCUDA, &poly_norm);
-
   // Apply repetition penalties to logits in-place
   ops.def(
       "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@@ -191,10 +185,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // Optimized top-k per row operation
   ops.def(
       "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
-      "Tensor! indices, Tensor! values, int numRows, int stride0, "
+      "Tensor! indices, int numRows, int stride0, "
      "int stride1) -> ()");
   ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);

+  ops.def(
+      "top_k_per_row_decode(Tensor logits, int next_n, "
+      "Tensor seq_lens, Tensor! indices, int numRows, "
+      "int stride0, int stride1) -> ()");
+  ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
+
   // Layernorm-quant
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
   ops.def(
@@ -5,7 +5,7 @@
 # docs/contributing/dockerfile/dockerfile.md and
 # docs/assets/contributing/dockerfile-stages-dependency.png

-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12

 # By parameterizing the base images, we allow third-party to use their own
@@ -132,7 +132,9 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
+    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
@@ -273,6 +275,7 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM

+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
 ARG GDRCOPY_CUDA_VERSION=12.8
 # Keep in line with FINAL_BASE_IMAGE
 ARG GDRCOPY_OS_VERSION=Ubuntu22_04
@@ -353,14 +356,23 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
+    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
+    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
+    && uv pip install --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

+# TODO (huydhn): Remove this once xformers is released for 2.9.0
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+    . /etc/environment
+    export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
+    uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
+BASH
+
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.4.0 \
-    && uv pip install --system flashinfer-jit-cache==0.4.0 \
+    uv pip install --system flashinfer-cubin==0.4.1 \
+    && uv pip install --system flashinfer-jit-cache==0.4.1 \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

@@ -422,6 +434,7 @@ ARG PYTHON_VERSION

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -434,7 +447,8 @@ ENV UV_LINK_MODE=copy
 RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt; \
+        uv pip install --system -r requirements/dev.txt \
+            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
     fi

 # install development dependencies (for testing)
@@ -31,7 +31,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
     apt-get update -y \
-    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
+    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
     && curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -106,14 +106,106 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel

+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
+
+# Flag to control whether to use pre-built vLLM wheels
+ARG VLLM_USE_PRECOMPILED=""
+
+# if USE_SCCACHE is set, use sccache to speed up compilation
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
+        && tar -xzf sccache.tar.gz \
+        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
+        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
+        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
+        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
+        && export SCCACHE_IDLE_TIMEOUT=0 \
+        && export CMAKE_BUILD_TYPE=Release \
+        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
+        && sccache --show-stats \
+        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
+        && sccache --show-stats; \
+    fi
+
+ARG vllm_target_device="cpu"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$USE_SCCACHE" != "1" ]; then \
+        # Clean any existing CMake artifacts
+        rm -rf .deps && \
+        mkdir -p .deps && \
+        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
+        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
+        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
+    fi
+
+# Check the size of the wheel if RUN_WHEEL_CHECK is true
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+# sync the default value with .buildkite/check-wheel-size.py
+ARG VLLM_MAX_SIZE_MB=450
+ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
+ARG RUN_WHEEL_CHECK=true
+RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
+        python3 check-wheel-size.py dist; \
+    else \
+        echo "Skipping wheel size check."; \
+    fi
+
 ######################### TEST DEPS #########################
 FROM base AS vllm-test-deps

 WORKDIR /workspace/vllm

+# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
     uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.0
+# release version: v0.4.1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.0 \
+    && git checkout v0.4.1 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
@@ -1,7 +1,7 @@
 # default base image
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
-ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG BASE_IMAGE=rocm/vllm-dev:base_custom_1020_rc1_20251008_tuned_20251008

 FROM ${BASE_IMAGE} AS base

@@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
 # Remove sccache
 RUN python3 -m pip install --upgrade pip
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
 ARG COMMON_WORKDIR
@ -1,13 +1,13 @@
|
|||||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||||
ARG TRITON_BRANCH="f9e5bf54"
|
ARG TRITON_BRANCH="57c693b6"
|
||||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||||
ARG PYTORCH_BRANCH="b2fb6885"
|
ARG PYTORCH_BRANCH="1c57644d"
|
||||||
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
||||||
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||||
ARG FA_BRANCH="0e60e394"
|
ARG FA_BRANCH="0e60e394"
|
||||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||||
ARG AITER_BRANCH="2ab9f4cd"
|
ARG AITER_BRANCH="eef23c7f"
|
||||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||||
|
|
||||||
FROM ${BASE_IMAGE} AS base
|
FROM ${BASE_IMAGE} AS base
|
||||||
|
|||||||
@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
|
|||||||
- [vllm.config.CompilationConfig][]
|
- [vllm.config.CompilationConfig][]
|
||||||
- [vllm.config.VllmConfig][]
|
- [vllm.config.VllmConfig][]
|
||||||
|
|
||||||
[](){ #offline-inference-api }
|
|
||||||
|
|
||||||
## Offline Inference
|
## Offline Inference
|
||||||
|
|
||||||
LLM Class.
|
LLM Class.
|
||||||
@ -45,18 +43,14 @@ Engine classes for offline and online inference.
|
|||||||
|
|
||||||
Inference parameters for vLLM APIs.
|
Inference parameters for vLLM APIs.
|
||||||
|
|
||||||
[](){ #sampling-params }
|
|
||||||
|
|
||||||
- [vllm.SamplingParams][]
|
- [vllm.SamplingParams][]
|
||||||
- [vllm.PoolingParams][]
|
- [vllm.PoolingParams][]
|
||||||
|
|
||||||
[](){ #multi-modality }
|
|
||||||
|
|
||||||
## Multi-Modality
|
## Multi-Modality
|
||||||
|
|
||||||
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
||||||
|
|
||||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
|
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
|
||||||
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
|
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
|
||||||
|
|
||||||
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
|
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 119 KiB After Width: | Height: | Size: 119 KiB |
@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
|
|||||||
|
|
||||||
There are three main levels of configuration, from highest priority to lowest priority:
|
There are three main levels of configuration, from highest priority to lowest priority:
|
||||||
|
|
||||||
- [Request parameters][completions-api] and [input arguments][sampling-params]
|
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
|
||||||
- [Engine arguments](./engine_args.md)
|
- [Engine arguments](./engine_args.md)
|
||||||
- [Environment variables](./env_vars.md)
|
- [Environment variables](./env_vars.md)
|
||||||
|
|||||||
@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
|||||||
!!! note
|
!!! note
|
||||||
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
||||||
|
|
||||||
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
||||||
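For illustration, loading the resulting sharded checkpoint might look like the sketch below. The `load_format="sharded_state"` value and the requirement that the tensor-parallel size match the one used at save time are assumptions based on the conversion script above, so check that script's documentation for the exact options in your version.

```python
# Hedged sketch: load a checkpoint previously saved with save_sharded_state.py.
# The load_format name and directory layout are assumptions; verify against
# the script's docstring for your vLLM version.
from vllm import LLM

llm = LLM(
    model="/path/to/sharded-checkpoint",   # directory produced by the conversion script
    load_format="sharded_state",           # assumed load format for sharded checkpoints
    tensor_parallel_size=2,                # should match the TP size used when saving
)
```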
|
|
||||||
## Quantization
|
## Quantization
|
||||||
|
|
||||||
|
|||||||
@ -27,8 +27,6 @@ You can monitor the number of preemption requests through Prometheus metrics exp
|
|||||||
|
|
||||||
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
|
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
|
||||||
|
|
||||||
[](){ #chunked-prefill }
|
|
||||||
|
|
||||||
## Chunked Prefill
|
## Chunked Prefill
|
||||||
|
|
||||||
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
||||||
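As a hedged illustration of turning this on for offline inference, the sketch below uses the `enable_chunked_prefill` and `max_num_batched_tokens` engine arguments; defaults vary by vLLM version (recent versions enable chunked prefill automatically), so treat the values here as placeholders rather than recommendations.

```python
# Minimal sketch: tuning chunked prefill for an offline LLM instance.
# Chunked prefill may already be enabled by default in your vLLM version;
# max_num_batched_tokens bounds how many tokens are scheduled per step.
from vllm import LLM

llm = LLM(
    model="ibm-granite/granite-3.1-8b-instruct",
    enable_chunked_prefill=True,
    max_num_batched_tokens=2048,
)
```

Smaller `max_num_batched_tokens` values generally favor decode latency, while larger values favor prefill throughput.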
@ -174,14 +172,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
|||||||
|
|
||||||
Known supported models (with corresponding benchmarks):
|
Known supported models (with corresponding benchmarks):
|
||||||
|
|
||||||
- dots_ocr (<gh-pr:25466>)
|
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
|
||||||
- GLM-4.1V or above (<gh-pr:23168>)
|
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
|
||||||
- InternVL (<gh-pr:23909>)
|
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
|
||||||
- Kimi-VL (<gh-pr:23817>)
|
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
|
||||||
- Llama4 (<gh-pr:18368>)
|
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
|
||||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
|
||||||
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
|
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
|
||||||
- Step3 (<gh-pr:22697>)
|
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)
|
||||||
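As a hedged sketch of the engine argument mentioned above, enabling data-parallel processing for the multi-modal encoder of one of the models listed might look like this (the model name and parallel sizes are placeholders, not recommendations):

```python
# Illustrative sketch only: the mm_encoder_tp_mode="data" engine argument
# enables DP for the vision encoder of a supported multi-modal model.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",  # placeholder; any model from the list above
    tensor_parallel_size=2,
    mm_encoder_tp_mode="data",
)
```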
|
|
||||||
## Input Processing
|
## Input Processing
|
||||||
|
|
||||||
|
|||||||
@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ
|
|||||||
|
|
||||||
### Tune your workloads
|
### Tune your workloads
|
||||||
|
|
||||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||||
|
|
||||||
### Future Topics We'll Cover
|
### Future Topics We'll Cover
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
|
|||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
See <gh-file:LICENSE>.
|
See [LICENSE](../../LICENSE).
|
||||||
|
|
||||||
## Developing
|
## Developing
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
|
|||||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||||
|
|
||||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
|||||||
|
|
||||||
### Documentation
|
### Documentation
|
||||||
|
|
||||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
|
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).
|
||||||
|
|
||||||
Get started with:
|
Get started with:
|
||||||
|
|
||||||
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
|
|||||||
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
|
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).
|
||||||
|
|
||||||
## Pull Requests & Code Reviews
|
## Pull Requests & Code Reviews
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
|
|||||||
|
|
||||||
### DCO and Signed-off-by
|
### DCO and Signed-off-by
|
||||||
|
|
||||||
When contributing changes to this project, you must agree to the <gh-file:DCO>.
|
When contributing changes to this project, you must agree to the [DCO](../../DCO).
|
||||||
Commits must include a `Signed-off-by:` header which certifies agreement with
|
Commits must include a `Signed-off-by:` header which certifies agreement with
|
||||||
the terms of the DCO.
|
the terms of the DCO.
|
||||||
|
|
||||||
|
|||||||
@ -6,9 +6,10 @@ toc_depth: 4
|
|||||||
|
|
||||||
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
||||||
|
|
||||||
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||||
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
|
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
|
||||||
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
|
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||||
|
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
|
||||||
|
|
||||||
[Benchmark CLI]: #benchmark-cli
|
[Benchmark CLI]: #benchmark-cli
|
||||||
|
|
||||||
@ -29,7 +30,7 @@ th {
|
|||||||
| Dataset | Online | Offline | Data Path |
|
| Dataset | Online | Offline | Data Path |
|
||||||
|---------|--------|---------|-----------|
|
|---------|--------|---------|-----------|
|
||||||
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
||||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||||
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
||||||
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
||||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||||
@ -714,7 +715,7 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
|
|||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
|
|
||||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
||||||
- Video sampling is not yet implemented.
|
- Video sampling is not yet implemented.
|
||||||
|
|
||||||
Start the server (example):
|
Start the server (example):
|
||||||
@ -822,7 +823,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
|
|||||||
- CLIP: `--backend openai-embeddings-clip`
|
- CLIP: `--backend openai-embeddings-clip`
|
||||||
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
||||||
|
|
||||||
For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
|
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.
|
||||||
|
|
||||||
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
||||||
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
||||||
@ -924,7 +925,162 @@ throughput numbers correctly is also adjusted.
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
[](){ #performance-benchmarks }
|
## Parameter Sweeps
|
||||||
|
|
||||||
|
### Online Benchmark
|
||||||
|
|
||||||
|
[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
|
||||||
|
|
||||||
|
Follow these steps to run the script:
|
||||||
|
|
||||||
|
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
|
||||||
|
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
|
||||||
|
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
|
||||||
|
|
||||||
|
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"max_num_seqs": 32,
|
||||||
|
"max_num_batched_tokens": 1024
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 64,
|
||||||
|
"max_num_batched_tokens": 1024
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 64,
|
||||||
|
"max_num_batched_tokens": 2048
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 128,
|
||||||
|
"max_num_batched_tokens": 2048
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 128,
|
||||||
|
"max_num_batched_tokens": 4096
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"max_num_batched_tokens": 4096
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
|
||||||
|
|
||||||
|
- Example: Using different input/output lengths for random dataset:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"random_input_len": 128,
|
||||||
|
"random_output_len": 32
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"random_input_len": 256,
|
||||||
|
"random_output_len": 64
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"random_input_len": 512,
|
||||||
|
"random_output_len": 128
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Determine where you want to save the results, and pass that to `--output-dir`.
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.benchmarks.sweep.serve \
|
||||||
|
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
|
||||||
|
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
|
||||||
|
--serve-params benchmarks/serve_hparams.json \
|
||||||
|
--bench-params benchmarks/bench_hparams.json \
|
||||||
|
-o benchmarks/results
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! important
|
||||||
|
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
|
||||||
|
You can use `--dry-run` to preview the commands to be run.
|
||||||
|
|
||||||
|
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
|
||||||
|
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
|
||||||
|
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
|
||||||
|
|
||||||
|
!!! tip
|
||||||
|
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
|
||||||
|
|
||||||
|
### SLA Auto-Tuner
|
||||||
|
|
||||||
|
[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
|
||||||
|
|
||||||
|
For example, to ensure E2E latency within different target values for 99% of requests:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=500"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=1000"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"p99_e2el_ms": "<=2000"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.benchmarks.sweep.serve_sla \
|
||||||
|
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
|
||||||
|
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
|
||||||
|
--serve-params benchmarks/serve_hparams.json \
|
||||||
|
--bench-params benchmarks/bench_hparams.json \
|
||||||
|
--sla-params benchmarks/sla_hparams.json \
|
||||||
|
--sla-variable max_concurrency \
|
||||||
|
-o benchmarks/results
|
||||||
|
```
|
||||||
|
|
||||||
|
The algorithm for adjusting the SLA variable is as follows (see the sketch after this list):
|
||||||
|
|
||||||
|
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
|
||||||
|
- For example, the initial request rate is set to the concurrency under infinite QPS.
|
||||||
|
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
|
||||||
|
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
|
||||||
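The doubling-and-bisection procedure above can be sketched in plain Python as follows. This is a simplified illustration rather than the actual `serve_sla.py` implementation; `run_benchmark_and_check_sla` is a hypothetical callable standing in for a full `vllm bench serve` run followed by evaluation of the SLA constraints.

```python
def find_max_sla_value(run_benchmark_and_check_sla, initial_value: int) -> int:
    """Return the largest SLA-variable value (e.g. max concurrency) that still meets the SLA.

    run_benchmark_and_check_sla(value) is a hypothetical helper that runs one
    benchmark at the given value and returns True if every SLA constraint is met.
    """
    if not run_benchmark_and_check_sla(initial_value):
        return 0  # even the initial value violates the SLA

    # Step 2: keep doubling until the SLA is violated, which yields a window
    # (lo, hi) containing the point where the SLA is barely satisfied.
    lo, hi = initial_value, initial_value * 2
    while run_benchmark_and_check_sla(hi):
        lo, hi = hi, hi * 2

    # Step 3: binary search inside the window for the largest passing value.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if run_benchmark_and_check_sla(mid):
            lo = mid
        else:
            hi = mid
    return lo
```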
|
|
||||||
|
!!! important
|
||||||
|
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
|
||||||
|
|
||||||
|
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
|
||||||
|
|
||||||
|
### Visualizer
|
||||||
|
|
||||||
|
[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.benchmarks.sweep.plot benchmarks/results/<timestamp> \
|
||||||
|
--var-x max_concurrency \
|
||||||
|
--row-by random_input_len \
|
||||||
|
--col-by random_output_len \
|
||||||
|
--curve-by api_server_count,max_num_batched_tokens \
|
||||||
|
--filter-by 'max_concurrency<=1024'
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! tip
|
||||||
|
You can use `--dry-run` to preview the figures to be plotted.
|
||||||
|
|
||||||
## Performance Benchmarks
|
## Performance Benchmarks
|
||||||
|
|
||||||
@ -962,7 +1118,7 @@ For more results visualization, check the [visualizing the results](https://gith
|
|||||||
|
|
||||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||||
|
|
||||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||||
|
|
||||||
### Continuous Benchmarking
|
### Continuous Benchmarking
|
||||||
|
|
||||||
@ -988,12 +1144,10 @@ The benchmarking currently runs on a predefined set of models configured in the
|
|||||||
|
|
||||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||||
|
|
||||||
[](){ #nightly-benchmarks }
|
|
||||||
|
|
||||||
## Nightly Benchmarks
|
## Nightly Benchmarks
|
||||||
|
|
||||||
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
||||||
|
|
||||||
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
||||||
|
|
||||||
More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
||||||
|
|||||||
@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
|
|||||||
|
|
||||||
Strip timestamps and colorization:
|
Strip timestamps and colorization:
|
||||||
|
|
||||||
<gh-file:.buildkite/scripts/ci-clean-log.sh>
|
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./ci-clean-log.sh ci.log
|
./ci-clean-log.sh ci.log
|
||||||
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
|
|||||||
|
|
||||||
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
||||||
|
|
||||||
<gh-file:.buildkite/scripts/rerun-test.sh>
|
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
||||||
|
|||||||
@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
|
|||||||
PyTorch version as early as possible when a new [PyTorch stable
|
PyTorch version as early as possible when a new [PyTorch stable
|
||||||
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
||||||
This process is non-trivial due to the gap between PyTorch
|
This process is non-trivial due to the gap between PyTorch
|
||||||
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
|
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
|
||||||
update along with a list of potential issues and how to address them.
|
update along with a list of potential issues and how to address them.
|
||||||
|
|
||||||
## Test PyTorch release candidates (RCs)
|
## Test PyTorch release candidates (RCs)
|
||||||
@ -85,9 +85,9 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
|
|||||||
it doesn't populate the cache, so re-running it to warm up the cache
|
it doesn't populate the cache, so re-running it to warm up the cache
|
||||||
is ineffective.
|
is ineffective.
|
||||||
|
|
||||||
While ongoing efforts like [#17419](gh-issue:17419)
|
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
|
||||||
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
||||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
|
||||||
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
||||||
|
|
||||||
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
|
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
|
||||||
@ -100,35 +100,17 @@ to warm it up so that future builds are faster.
|
|||||||
|
|
||||||
## Update dependencies
|
## Update dependencies
|
||||||
|
|
||||||
Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need
|
Several vLLM dependencies like xFormers depend on PyTorch and need
|
||||||
to be updated accordingly. Rather than waiting for all of them to publish new
|
to be updated accordingly. Rather than waiting for all of them to publish new
|
||||||
releases (which would take too much time), they can be built from
|
releases (which would take too much time), they can be built from
|
||||||
source to unblock the update process.
|
source to unblock the update process.
|
||||||
|
|
||||||
### FlashInfer
|
|
||||||
|
|
||||||
Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
|
|
||||||
export FLASHINFER_ENABLE_SM90=1
|
|
||||||
uv pip install --system \
|
|
||||||
--no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
|
|
||||||
```
|
|
||||||
|
|
||||||
One caveat is that building FlashInfer from source adds approximately 30
|
|
||||||
minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
|
|
||||||
public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
|
|
||||||
team if you want to get the package published there.
|
|
||||||
|
|
||||||
### xFormers
|
### xFormers
|
||||||
|
|
||||||
Similar to FlashInfer, here is how to build and install xFormers from source:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
|
||||||
MAX_JOBS=16 uv pip install --system \
|
MAX_JOBS=16 uv pip install --system \
|
||||||
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
|
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Update all the different vLLM platforms
|
## Update all the different vLLM platforms
|
||||||
@ -138,5 +120,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
|
|||||||
for different platforms in vLLM CI/CD allows us to selectively choose
|
for different platforms in vLLM CI/CD allows us to selectively choose
|
||||||
which platforms to update. For instance, updating XPU requires the corresponding
|
which platforms to update. For instance, updating XPU requires the corresponding
|
||||||
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
||||||
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
||||||
<gh-pr:17444> completed the update for XPU.
|
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# Dockerfile
|
# Dockerfile
|
||||||
|
|
||||||
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
|
||||||
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
||||||
|
|
||||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# Summary
|
# Summary
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||||
|
|
||||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
|
|||||||
## 1. Bring your model code
|
## 1. Bring your model code
|
||||||
|
|
||||||
First, clone the PyTorch model code from the source repository.
|
First, clone the PyTorch model code from the source repository.
|
||||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
|
||||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
@ -83,7 +83,7 @@ def forward(
|
|||||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||||
|
|
||||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
|
||||||
|
|
||||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||||
|
|
||||||
@ -130,22 +130,22 @@ We consider 3 different scenarios:
|
|||||||
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
||||||
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
||||||
|
|
||||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||||
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||||
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||||
V0-only classes and code will be removed in the very near future.
|
V0-only classes and code will be removed in the very near future.
|
||||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
|
||||||
|
|
||||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||||
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
||||||
|
|
||||||
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
||||||
Please follow the same guidelines as case (2) for implementing these models.
|
Please follow the same guidelines as case (2) for implementing these models.
|
||||||
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
||||||
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
||||||
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
||||||
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
|
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
|
||||||
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
||||||
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
|
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
|
||||||
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
|
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
|
||||||
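To make the required hooks concrete, the following is a plain-Python schematic of a custom mamba-like layer. It intentionally avoids real vLLM imports: in actual code the layer would inherit from vLLM's `MambaBase`, the metadata class would live in an attention backend, and the exact signatures may differ from this sketch.

```python
# Schematic only: shapes of the hooks described above, with no vLLM imports.
from dataclasses import dataclass

import torch


@dataclass
class MyMambaLikeMetadata:
    """Stand-in for the per-batch "attention meta-data" shared across layers."""
    num_prefill_tokens: int
    num_decode_tokens: int
    state_indices: torch.Tensor  # maps requests to slots in the in-place state


class MyMambaLikeLayer(torch.nn.Module):
    """Schematic custom layer; real implementations inherit from vLLM's MambaBase."""

    def __init__(self, hidden_size: int, state_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.proj = torch.nn.Linear(hidden_size, hidden_size)

    # Counterparts of get_state_dtype / get_state_shape described above:
    def get_state_dtype(self) -> torch.dtype:
        return torch.float32

    def get_state_shape(self) -> tuple[int, ...]:
        return (self.hidden_size, self.state_size)

    # Counterparts of mamba_type / get_attn_backend:
    @property
    def mamba_type(self) -> str:
        return "my_mamba_like"

    def get_attn_backend(self):
        # In vLLM this would return the attention backend that provides
        # the metadata class; here we just return the stand-in type.
        return MyMambaLikeMetadata

    def forward(self, x: torch.Tensor, state: torch.Tensor,
                metadata: MyMambaLikeMetadata) -> torch.Tensor:
        # The defining property of a "mamba-like" layer: the state tensor is
        # updated in-place instead of being appended to like a KV cache.
        state.mul_(0.95).add_(0.05 * x.mean())
        return self.proj(x)
```

In real vLLM code, the call into such a layer would additionally be wrapped in a custom op registered via `direct_register_custom_op`, with the op name added to `_attention_ops`, as described above.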
|
|||||||
@ -507,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
|
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
|
||||||
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
||||||
|
|
||||||
=== "With postprocessing: Fuyu"
|
=== "With postprocessing: Fuyu"
|
||||||
@ -569,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
|
||||||
for text-only inputs to prevent unnecessary warnings from HF processor.
|
for text-only inputs to prevent unnecessary warnings from HF processor.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
@ -828,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
|
||||||
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
||||||
|
|
||||||
### Handling prompt updates unrelated to multi-modal data
|
### Handling prompt updates unrelated to multi-modal data
|
||||||
|
|
||||||
@ -837,9 +837,9 @@ Examples:
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
|
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
|
||||||
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
|
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
|
||||||
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
|
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
||||||
|
|
||||||
### Custom HF processor
|
### Custom HF processor
|
||||||
|
|
||||||
@ -847,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
|
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
|
||||||
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
|
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
|
||||||
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
|
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)
|
||||||
|
|||||||
@ -8,11 +8,11 @@ This page provides detailed instructions on how to do so.
|
|||||||
|
|
||||||
## Built-in models
|
## Built-in models
|
||||||
|
|
||||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
|
||||||
This gives you the ability to modify the codebase and test your model.
|
This gives you the ability to modify the codebase and test your model.
|
||||||
|
|
||||||
After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
|
||||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
|
||||||
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
|
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
|
|||||||
@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
|
|||||||
|
|
||||||
### Model loading
|
### Model loading
|
||||||
|
|
||||||
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
|
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
|
||||||
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
@ -26,26 +26,24 @@ Passing these tests provides more confidence that your implementation is correct
|
|||||||
|
|
||||||
### Model correctness
|
### Model correctness
|
||||||
|
|
||||||
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
|
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
|
||||||
|
|
||||||
#### Generative models
|
#### Generative models
|
||||||
|
|
||||||
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
|
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
|
||||||
|
|
||||||
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
||||||
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
|
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
|
||||||
|
|
||||||
#### Pooling models
|
#### Pooling models
|
||||||
|
|
||||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
|
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
||||||
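As a rough sketch (not the exact helper defined in `tests/models/utils.py`, and with an assumed similarity threshold), the cosine-similarity check amounts to something like:

```python
import torch
import torch.nn.functional as F


def assert_embeddings_close(vllm_emb: torch.Tensor, hf_emb: torch.Tensor,
                            min_cosine: float = 0.99) -> None:
    # Compare pooled embeddings produced by vLLM and HF for the same inputs.
    sim = F.cosine_similarity(vllm_emb, hf_emb, dim=-1)
    assert sim.min().item() >= min_cosine, f"cosine similarity too low: {sim.min().item():.4f}"
```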
|
|
||||||
[](){ #mm-processing-tests }
|
|
||||||
|
|
||||||
### Multi-modal processing
|
### Multi-modal processing
|
||||||
|
|
||||||
#### Common tests
|
#### Common tests
|
||||||
|
|
||||||
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
|
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
|
||||||
|
|
||||||
- Text + multi-modal data
|
- Text + multi-modal data
|
||||||
- Tokens + multi-modal data
|
- Tokens + multi-modal data
|
||||||
@ -54,6 +52,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>
|
|||||||
|
|
||||||
#### Model-specific tests
|
#### Model-specific tests
|
||||||
|
|
||||||
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
|
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
|
||||||
|
|
||||||
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
|
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).
|
||||||
|
|||||||
@ -248,9 +248,9 @@ No extra registration is required beyond having your model class available via t
|
|||||||
|
|
||||||
## Examples in-tree
|
## Examples in-tree
|
||||||
|
|
||||||
- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
|
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||||
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
|
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
|
||||||
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
|
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||||
|
|
||||||
## Test with the API
|
## Test with the API
|
||||||
|
|
||||||
@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
|
|||||||
http://localhost:8000/v1/audio/translations
|
http://localhost:8000/v1/audio/translations
|
||||||
```
|
```
|
||||||
|
|
||||||
Or check out more examples in <gh-file:examples/online_serving>.
|
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
||||||
|
|||||||
@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
|
|||||||
|
|
||||||
#### Offline Inference
|
#### Offline Inference
|
||||||
|
|
||||||
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
|
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
|
||||||
|
|
||||||
#### OpenAI Server
|
#### OpenAI Server
|
||||||
|
|
||||||
@ -180,9 +180,13 @@ The profiling traces generated by the continuous profiling workflow are publicly
|
|||||||
The Python standard library includes
|
The Python standard library includes
|
||||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of the vLLM codebase.
|
code. vLLM includes a couple of helpers that make it easy to apply it to a section of the vLLM codebase.
|
||||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
Both the `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` functions can be
|
||||||
used to profile a section of code.
|
used to profile a section of code.
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
The legacy import paths `vllm.utils.cprofile` and `vllm.utils.cprofile_context` are deprecated.
|
||||||
|
Please use `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` instead.
|
||||||
|
|
||||||
### Example usage - decorator
|
### Example usage - decorator
|
||||||
|
|
||||||
The first helper is a Python decorator that can be used to profile a function.
|
The first helper is a Python decorator that can be used to profile a function.
|
||||||
@ -190,9 +194,9 @@ If a filename is specified, the profile will be saved to that file. If no filena
|
|||||||
specified, profile data will be printed to stdout.
|
specified, profile data will be printed to stdout.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import vllm.utils
|
from vllm.utils.profiling import cprofile
|
||||||
|
|
||||||
@vllm.utils.cprofile("expensive_function.prof")
|
@cprofile("expensive_function.prof")
|
||||||
def expensive_function():
|
def expensive_function():
|
||||||
# some expensive code
|
# some expensive code
|
||||||
pass
|
pass
|
||||||
@ -204,13 +208,13 @@ The second helper is a context manager that can be used to profile a block of
|
|||||||
code. Similar to the decorator, the filename is optional.
|
code. Similar to the decorator, the filename is optional.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import vllm.utils
|
from vllm.utils.profiling import cprofile_context
|
||||||
|
|
||||||
def another_function():
|
def another_function():
|
||||||
# more expensive code
|
# more expensive code
|
||||||
pass
|
pass
|
||||||
|
|
||||||
with vllm.utils.cprofile_context("another_function.prof"):
|
with cprofile_context("another_function.prof"):
|
||||||
another_function()
|
another_function()
|
||||||
```
|
```
|
||||||
|
|
||||||

@@ -1,7 +1,5 @@
# Using Docker

## Use vLLM's Official Docker Image

vLLM offers an official Docker image for deployment.

@@ -10,7 +8,7 @@ The image can be used to run OpenAI compatible server and is available on Docker

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=$HF_TOKEN" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
```

@@ -22,7 +20,7 @@ This image can also be used with other container engines such as [Podman](https:

```bash
podman run --device nvidia.com/gpu=all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=$HF_TOKEN" \
    -p 8000:8000 \
    --ipc=host \
    docker.io/vllm/vllm-openai:latest \
```

@@ -37,7 +35,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
memory to share data between processes under the hood, particularly for tensor parallel inference.

!!! note
    Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).

    If you need to use those dependencies (having accepted the license terms),
    create a custom Dockerfile on top of the base image with an extra layer that installs them:

@@ -62,11 +60,9 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af

```
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```

## Building vLLM's Docker Image from Source

You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:

```bash
# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
```

@@ -128,7 +124,7 @@ To run vLLM with the custom-built Docker image:

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
    --env "HF_TOKEN=<secret>" \
    vllm/vllm-openai <args...>
```

@@ -1,11 +1,9 @@
# Anyscale

[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.

Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).

When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

@@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`

```yaml
        - name: vllm-leader
          image: docker.io/vllm/vllm-openai:latest
          env:
            - name: HF_TOKEN
              value: <your-hf-token>
          command:
            - sh
```

@@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`

```yaml
            ephemeral-storage: 800Gi
            cpu: 125
          env:
            - name: HF_TOKEN
              value: <your-hf-token>
          volumeMounts:
            - mountPath: /dev/shm
```

@@ -36,7 +36,7 @@ pip install -U vllm \

```bash
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)

1. Run the script

@@ -74,7 +74,7 @@ pip install vllm \

```bash
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)

1. Run the script:

@@ -20,7 +20,7 @@ pip install vllm streamlit openai

```bash
vllm serve Qwen/Qwen1.5-0.5B-Chat
```

1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)

1. Start the streamlit web UI and start to chat:

@@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:

```yaml
            "vllm serve meta-llama/Llama-3.2-1B-Instruct"
          ]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
```

@@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
            "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
          ]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
```

@@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
            "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
          ]
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
```

@@ -2,8 +2,6 @@
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.

## Build Nginx Container

This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.

@@ -27,8 +25,6 @@ Build the container:

```bash
docker build . -f Dockerfile.nginx --tag nginx-lb
```

## Create Simple Nginx Config file

Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.

@@ -53,8 +49,6 @@ Create a file named `nginx_conf/nginx.conf`. Note that you can add as many serve

```
}
```

## Build vLLM Container

@@ -73,16 +67,12 @@ docker build \

```bash
    --build-arg https_proxy=$https_proxy
```

## Create Docker Network

```bash
docker network create vllm_nginx
```

## Launch vLLM Containers

Notes:

@@ -122,8 +112,6 @@ Notes:

!!! note
    If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.

## Launch Nginx

@@ -135,8 +123,6 @@ docker run \

```bash
    --name nginx-lb nginx-lb:latest
```

## Verify That vLLM Servers Are Ready

@@ -47,9 +47,9 @@ Here is a sample of `LLM` class usage:

```python
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

More API details can be found in the [Offline Inference](../api/README.md#offline-inference) section of the API docs.

The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
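
Only the tail of that sample survives in this excerpt; as a rough self-contained sketch of the same pattern (the model name is illustrative):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen1.5-0.5B-Chat")  # any supported model works here
sampling_params = SamplingParams(temperature=0.8, max_tokens=32)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```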

### OpenAI-Compatible API Server

@@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.

```bash
vllm serve <model>
```

The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
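
Once the server is running, any OpenAI-compatible client can query it. A minimal sketch, assuming the server listens on the default port; the base URL, API key, and model name here are illustrative:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.completions.create(
    model="Qwen/Qwen1.5-0.5B-Chat",  # must match the model passed to `vllm serve`
    prompt="Hello, my name is",
    max_tokens=32,
)
print(response.choices[0].text)
```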

Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example:

@@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>

`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.

That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).

More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.

@@ -101,7 +101,7 @@ processing.

- **Output Processing**: Processes the outputs generated by the model, decoding the
  token IDs from a language model into human-readable text.

The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).

### AsyncLLMEngine

@@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients.

The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).

The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).

## Worker

@@ -17,7 +17,7 @@ In this document we will discuss the:
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
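
To make the definition concrete, a tiny illustrative helper (not vLLM code) that classifies a batch under this rule might look like:

```python
# Illustrative only: a batch is a "uniform decode" batch when every request in it
# contributes the same decode-sized query length (1, or 1 + num_spec_tokens).
def is_uniform_decode(query_lens: list[int], num_spec_tokens: int = 0) -> bool:
    uniform_query_len = 1 + num_spec_tokens
    return len(query_lens) > 0 and all(q == uniform_query_len for q in query_lens)
```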

!!! note
    The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.

## Motivation

@@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
The goal of this structure is to uniquely identify a (padded) batch with the minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is currently a constant at runtime for a given setup. For example, it should be either `1` for common pure decode or `1+num_spec_tokens` for the validation phase of speculative decode.
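
A minimal sketch of such a structure, assuming only the two fields named in the surrounding text (the exact vLLM definition may differ):

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class BatchDescriptor:
    # Padded token count for the batch; used as the CUDA Graphs lookup key.
    num_tokens: int
    # True for uniform decode batches, False for prefill / mixed batches.
    uniform_decode: bool = False
```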

!!! note
    The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., including more items such as `uniform_query_len` to support multiple uniform decode length settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token-length aware (for example, some multi-modal inputs).

### `CudagraphDispatcher`

@@ -2,7 +2,7 @@

## Introduction

FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py).

Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.

@@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,

The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`, but some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a `TopKWeightAndReduce` abstract class.

Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).

The `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
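
A rough sketch of the abstraction this enables, with illustrative signatures rather than the actual vLLM interfaces:

```python
# Illustrative sketch only; the real vLLM classes carry more arguments and state.
import torch

class TopKWeightAndReduce:
    def apply(self, fused_expert_output: torch.Tensor,
              topk_weights: torch.Tensor) -> torch.Tensor:
        """Weight the per-expert outputs by the router weights and sum over top-k."""
        raise NotImplementedError

class NoOpWeightAndReduce(TopKWeightAndReduce):
    # Used when the experts implementation already applied the weights itself.
    def apply(self, fused_expert_output, topk_weights):
        return fused_expert_output

class SumWeightAndReduce(TopKWeightAndReduce):
    def apply(self, fused_expert_output, topk_weights):
        # fused_expert_output: [num_tokens, top_k, hidden]; topk_weights: [num_tokens, top_k]
        return (fused_expert_output * topk_weights.unsqueeze(-1)).sum(dim=1)
```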

@@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C

#### Step 1: Add an All2All manager

The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).

#### Step 2: Add a FusedMoEPrepareAndFinalize Type

@@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl

### How To Unit Test

We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).

The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
compatible, runs some correctness tests.
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,

1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
2. Update the `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
   `Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
   `Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py).

Doing this will add the new implementation to the test suite.

### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility

The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
with incompatible types, the script will error.

### How To Profile

Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py).
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`

@@ -6,7 +6,7 @@ When performing an inference with IO Processor plugins, the prompt type is defin

## Writing an IO Processor Plugin

IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:

```python
IOProcessorInput = TypeVar("IOProcessorInput")
```

@@ -67,9 +67,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.

The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available in [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
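
Putting those methods together, a plugin skeleton might look roughly like this; the method bodies are illustrative and the exact signatures live in the interface definition:

```python
# Illustrative skeleton only; consult the IOProcessor interface for exact signatures.
from vllm.plugins.io_processors.interface import IOProcessor

class MyIOProcessorPlugin(IOProcessor):
    def parse_request(self, request):
        # Validate the user prompt and convert it into the plugin's input type.
        ...

    def pre_process(self, prompt):
        # Turn the validated plugin input into vLLM model prompts.
        ...

    def post_process(self, model_output):
        # Turn PoolingRequestOutput objects into the plugin's custom output type.
        ...

    def output_to_response(self, plugin_output):
        # Online serving only: wrap the plugin output in an IOProcessorResponse.
        ...
```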

An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.

## Using an IO Processor plugin

@@ -1,12 +1,12 @@
# Metrics

vLLM exposes a rich set of metrics to support observability and capacity planning for the V1 engine.

## Objectives

- Provide comprehensive coverage of engine- and request-level metrics to aid production monitoring.
- Prioritize Prometheus integrations, as this is what we expect to be used in production environments.
- Offer logging support (i.e. printing metrics to the info log) for ad-hoc testing, debugging, development, and exploratory use cases.

## Background

@@ -17,45 +17,36 @@ Metrics in vLLM can be categorized as follows:

The mental model is that server-level metrics help explain the values of request-level metrics.

### Metrics Overview

### v1 Metrics

In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:

- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of the request parameter `n`.
- `vllm:request_params_max_tokens` (Histogram) - Histogram of the `max_tokens` parameter in requests.
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md).
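
For a quick look at these metrics on a running server, the endpoint can be scraped directly. A small sketch, assuming a server listening on localhost:8000:

```python
import urllib.request

# Fetch the Prometheus exposition text and print only the vLLM metric samples.
with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
    for line in resp.read().decode().splitlines():
        if line.startswith("vllm:"):
            print(line)
```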

@@ -80,13 +71,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_decode_time_seconds` - Request decode time.
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.

See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.

### Prometheus Client Library

Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.

During those migrations we briefly lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):

```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
```

@@ -99,7 +90,9 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201

### Multi-process Mode

Historically, metrics were collected in the engine core process and multiprocess mode was used to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.

More recently, metrics are collected in the API server process and multiprocess mode is only used when `--api-server-count > 1`. See <https://github.com/vllm-project/vllm/pull/17546> and details on [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).

### Built in Python/Process Metrics

@@ -116,41 +109,37 @@ The following metrics are supported by default by `prometheus_client`, but they
- `process_open_fds`
- `process_max_fds`

Therefore, these metrics are unavailable when `--api-server-count > 1`. It's questionable how relevant these are since they do not aggregate these stats for all processes that make up a vLLM instance.

## Metrics Design

The ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature was where much of the metrics design was planned. For example, see where [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).

### Legacy PRs

To help understand the background to the metrics design, here are some of the relevant PRs which added the original, now legacy, metrics:

- <https://github.com/vllm-project/vllm/pull/1890>
- <https://github.com/vllm-project/vllm/pull/2316>
- <https://github.com/vllm-project/vllm/pull/2730>
- <https://github.com/vllm-project/vllm/pull/4464>
- <https://github.com/vllm-project/vllm/pull/7279>

### Metrics Implementation PRs

For background, here are the relevant PRs relating to the metrics implementation tracked in <https://github.com/vllm-project/vllm/issues/10582>:

- <https://github.com/vllm-project/vllm/pull/11962>
- <https://github.com/vllm-project/vllm/pull/11973>
- <https://github.com/vllm-project/vllm/pull/10907>
- <https://github.com/vllm-project/vllm/pull/12416>
- <https://github.com/vllm-project/vllm/pull/12478>
- <https://github.com/vllm-project/vllm/pull/12516>
- <https://github.com/vllm-project/vllm/pull/12530>
- <https://github.com/vllm-project/vllm/pull/12561>
- <https://github.com/vllm-project/vllm/pull/12579>
- <https://github.com/vllm-project/vllm/pull/12592>
- <https://github.com/vllm-project/vllm/pull/12644>

### Metrics Collection

@@ -394,15 +383,14 @@ distinguish between per-adapter counts. This should be revisited.
Note that `multiprocess_mode="livemostrecent"` is used - the most
recent metric is used, but only from currently running processes.

This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
If we revisit this design and deprecate the old metric, we should
coordinate with downstream users so they can migrate before the removal.

### Prefix Cache metrics

The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
some interesting points which may be relevant to how we approach
future metrics.

@@ -439,8 +427,8 @@ suddenly (from their perspective) when it is removed, even if there is
an equivalent metric for them to use.

As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).

In general:

@@ -460,40 +448,38 @@ the project-wide deprecation policy.

### Unimplemented - `vllm:tokens_total`

Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
removed.

### Duplicated - Queue Time

The `vllm:time_in_queue_requests` Histogram metric was added by
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:

```python
self.metrics.first_scheduled_time = now
self.metrics.time_in_queue = now - self.metrics.arrival_time
```

Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
us with:

```python
if seq_group.is_finished():
    if (seq_group.metrics.first_scheduled_time is not None and
            seq_group.metrics.first_token_time is not None):
        time_queue_requests.append(
            seq_group.metrics.first_scheduled_time -
            seq_group.metrics.arrival_time)
    ...
    if seq_group.metrics.time_in_queue is not None:
        time_in_queue_requests.append(
            seq_group.metrics.time_in_queue)
```

This seems duplicative, and one of them should be removed. The latter
is used by the Grafana dashboard, so we should deprecate or remove the
former.

### Prefix Cache Hit Rate

@@ -502,7 +488,7 @@ See above - we now expose 'queries' and 'hits' counters rather than a
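
A consumer that wants a hit rate can derive it from the two counters. A small illustrative sketch over the raw exposition text (a real dashboard would use PromQL `rate()` over the same counters):

```python
import urllib.request

# Counter names follow the list above; depending on the Prometheus client version
# they may appear with a "_total" suffix in the exposition text.
text = urllib.request.urlopen("http://localhost:8000/metrics").read().decode()

def total(prefix: str) -> float:
    return sum(
        float(line.rsplit(" ", 1)[-1])
        for line in text.splitlines()
        if line.startswith(prefix)
    )

queries = total("vllm:prefix_cache_queries")
hits = total("vllm:prefix_cache_hits")
print(f"prefix cache hit rate: {hits / queries:.2%}" if queries else "no queries yet")
```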

### KV Cache Offloading

Two legacy metrics relate to a "swapped" preemption mode that is no
longer relevant in v1:

- `vllm:num_requests_swapped`

@@ -513,7 +499,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.

Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
sharing between requests, and copy-on-write to do branching. CPU

@@ -526,7 +512,7 @@ and the part of the prompt that was evicted can be recomputed.
SequenceGroup was removed in V1, although a replacement will be
required for "parallel sampling" (`n>1`).
[Beam search was moved out of the core](https://github.com/vllm-project/vllm/issues/8306). There was a
lot of complex code for a very uncommon feature.

In V1, with prefix caching being better (zero overhead) and therefore

@@ -537,11 +523,11 @@ better.

### Parallel Sampling

Some legacy metrics are only relevant in the context of "parallel
sampling". This is where the `n` parameter in a request is used to
request multiple completions from the same prompt.

As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
also add these metrics.

- `vllm:request_params_n` (Histogram)

@@ -556,7 +542,7 @@ also add these metrics.

### Speculative Decoding

Some legacy metrics are specific to "speculative decoding". This is where
we generate candidate tokens using a faster, approximate method or
model and then validate those tokens with the larger model.

@@ -566,9 +552,9 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)

There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should
revisit these metrics in this context.

!!! note
    We should probably expose acceptance rate as separate accepted

@@ -587,7 +573,7 @@ see:
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.

This is a non-trivial topic. Consider this comment from Rob:

@@ -641,7 +627,7 @@ metrics are often relatively straightforward to add:
   metrics are usually of very limited use unless they can be enabled
   by default and in production.
3. They have an impact on development and maintenance of the
   project. Every metric added over time has made this effort more
   time-consuming, and perhaps not all metrics justify this ongoing
   investment in their maintenance.

@@ -652,24 +638,24 @@ performance and health. Tracing, on the other hand, tracks individual
requests as they move through different services and components. Both
fall under the more general heading of "Observability".

vLLM has support for OpenTelemetry tracing:

- Added by <https://github.com/vllm-project/vllm/pull/4687> and reinstated by <https://github.com/vllm-project/vllm/pull/20372>
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing docs](../examples/online_serving/opentelemetry.md)
- [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)

OpenTelemetry has a
[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).

Since metrics is a big enough topic on its own, we consider the topic
of tracing to be quite separate from metrics.

### OpenTelemetry Model Forward vs Execute Time

The current implementation exposes the following two metrics:

- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
  in the model forward pass when this request was in the batch.
@ -685,7 +671,7 @@ documentation for this option states:
|
|||||||
> use of possibly costly and or blocking operations and hence might
|
> use of possibly costly and or blocking operations and hence might
|
||||||
> have a performance impact.
|
> have a performance impact.
|
||||||
|
|
||||||
The metrics were added by <gh-pr:7089> and show up in an OpenTelemetry trace
|
The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and show up in an OpenTelemetry trace
|
||||||
as:
|
as:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# Multi-Modal Data Processing
|
# Multi-Modal Data Processing
|
||||||
|
|
||||||
To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
To enable various optimizations in vLLM such as [chunked prefill](../configuration/optimization.md#chunked-prefill) and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||||
|
|
||||||
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
|
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
|
||||||
|
|
||||||
@ -41,14 +41,10 @@ While HF processors support text + multi-modal inputs natively, this is not so f
|
|||||||
|
|
||||||
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
||||||
|
|
||||||
[](){ #mm-dummy-text }
|
|
||||||
|
|
||||||
### Dummy text
|
### Dummy text
|
||||||
|
|
||||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
||||||
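A minimal sketch of such an override is shown below. The exact method signature (in particular the `mm_counts` argument) and the `<image>` placeholder are assumptions for illustration; refer to [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] for the authoritative interface.

```python
from collections.abc import Mapping

from vllm.multimodal.profiling import BaseDummyInputsBuilder


class MyDummyInputsBuilder(BaseDummyInputsBuilder):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One placeholder per multi-modal item, e.g. "<image><image>" for two images.
        return "<image>" * mm_counts.get("image", 0)
```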
|
|
||||||
[](){ #mm-automatic-prompt-updating }
|
|
||||||
|
|
||||||
### Automatic prompt updating
|
### Automatic prompt updating
|
||||||
|
|
||||||
We address the second issue by implementing model-agnostic code in
|
We address the second issue by implementing model-agnostic code in
|
||||||
@ -60,8 +56,8 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
|
|||||||
|
|
||||||
## Processor Output Caching
|
## Processor Output Caching
|
||||||
|
|
||||||
Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
||||||
|
|
||||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||||
|
|
||||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
|
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||||
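The cache-then-batch flow described above can be sketched as follows. This is a simplified illustration rather than the actual vLLM implementation, and the `hf_process_batch` helper is a placeholder.

```python
from collections.abc import Callable, Hashable
from typing import Any


def process_with_cache(
    items: list[Hashable],
    cache: dict[Hashable, Any],
    hf_process_batch: Callable[[list[Hashable]], list[Any]],
) -> list[Any]:
    """Simplified sketch of the caching flow; not the actual vLLM code."""
    # 1. Determine which items are cache misses (preserving order, no duplicates).
    missing = [item for item in dict.fromkeys(items) if item not in cache]

    # 2. Run the expensive HF processor once, over the missing items only.
    if missing:
        for item, output in zip(missing, hf_process_batch(missing)):
            cache[item] = output

    # 3. Merge cached and freshly processed outputs back into the original order.
    return [cache[item] for item in items]
```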
|
|||||||
@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
|
|||||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||||
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
|
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||||
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
|
|||||||
|
|
||||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||||
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
||||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
|
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts`|
|
||||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Debugging
|
## Debugging
|
||||||
|
|
||||||
Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
|
Please see the [Troubleshooting](../usage/troubleshooting.md#python-multiprocessing)
|
||||||
page for information on known issues and how to solve them.
|
page for information on known issues and how to solve them.
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
|
|||||||
|
|
||||||
Related PRs:
|
Related PRs:
|
||||||
|
|
||||||
- <gh-pr:8823>
|
- <https://github.com/vllm-project/vllm/pull/8823>
|
||||||
|
|
||||||
## Prior State in v1
|
## Prior State in v1
|
||||||
|
|
||||||
|
|||||||
@ -41,7 +41,7 @@ Every plugin has three parts:
|
|||||||
|
|
||||||
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
||||||
2. **Plugin name**: The name of the plugin. This is the name on the left-hand side of an entry in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
2. **Plugin name**: The name of the plugin. This is the name on the left-hand side of an entry in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
||||||
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
3. **Plugin value**: The fully qualified name of the function or module to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module (see the `setup.py` sketch after this list).
|
||||||
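As a reference, here is a minimal sketch of how the three parts map onto a plugin package's `setup.py`. The package metadata below is made up; the group, name, and value reuse the example above.

```python
from setuptools import setup

setup(
    name="vllm_add_dummy_model",  # placeholder package metadata
    version="0.1",
    packages=["vllm_add_dummy_model"],
    entry_points={
        # 1. Plugin group: always `vllm.general_plugins` for general plugins.
        "vllm.general_plugins": [
            # 2. Plugin name (left of '=')   3. Plugin value (right of '=')
            "register_dummy_model = vllm_add_dummy_model:register",
        ],
    },
)
```

With this package installed, setting `VLLM_PLUGINS=register_dummy_model` loads only this plugin.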
|
|
||||||
## Types of supported plugins
|
## Types of supported plugins
|
||||||
|
|
||||||
@ -51,6 +51,8 @@ Every plugin has three parts:
|
|||||||
|
|
||||||
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified class name of the IOProcessor.
|
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified class name of the IOProcessor.
|
||||||
|
|
||||||
|
- **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-tree loggers into vLLM. The entry point should be a class that subclasses `StatLoggerBase`.
|
||||||
|
|
||||||
## Guidelines for Writing Plugins
|
## Guidelines for Writing Plugins
|
||||||
|
|
||||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||||
|
|||||||
@ -213,22 +213,22 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
|
|||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 3: Request 0 fills block 3 and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
**Time 2: Request 0 fills block 3 and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 4: Request 1 comes in with 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
**Time 3: Request 1 comes in with 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
**Time 4: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are being used by Request 1.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 6: Request 1 is finished and free.**
|
**Time 5: Request 1 is finished and free.**
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
**Time 7: Request 2 comes in with 29 prompt tokens, where the first 12 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
**Time 6: Request 2 comes in with 29 prompt tokens, where the first 12 tokens are the same as request 0.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
|
||||||
|
|
||||||

|

|
||||||
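The full-block matching rule this walkthrough relies on (block size 4, partial matches do not count) can be reproduced with a toy snippet like the one below. It is not vLLM's implementation, just the counting logic.

```python
def count_prefix_cache_hits(prompt: list[int], cached: list[int], block_size: int = 4) -> int:
    """Toy version of full-block prefix matching; not the actual vLLM code."""
    hits = 0
    for start in range(0, len(prompt) - block_size + 1, block_size):
        if prompt[start:start + block_size] == cached[start:start + block_size]:
            hits += 1
        else:
            break  # a partial match (e.g. 2 of 4 tokens) is not a cache hit
    return hits


request_0 = list(range(16))              # previously cached as four full blocks
request_1 = list(range(10)) + [99] * 4   # 14 tokens, first 10 shared with request 0
print(count_prefix_cache_hits(request_1, request_0))  # -> 2 blocks (8 tokens) hit
```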
|
|||||||
@ -19,8 +19,8 @@ vLLM will take all the available factors into consideration, and decide a direct
|
|||||||
|
|
||||||
The factors considered include:
|
The factors considered include:
|
||||||
|
|
||||||
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
|
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](../../vllm/config))
|
||||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
|
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](../../vllm/compilation/compiler_interface.py))
|
||||||
- The model's forward function and the relevant functions called by the forward function (see below)
|
- The model's forward function and the relevant functions called by the forward function (see below)
|
||||||
|
|
||||||
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
|
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
|
||||||
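Conceptually, the per-factor hashes are folded into a single key that names the cache directory. The sketch below is illustrative only: the factor strings and the directory layout are made up and are not vLLM's actual scheme.

```python
import hashlib


def combine_factors(factor_hashes: list[str]) -> str:
    """Fold per-factor hashes into one cache key (illustrative only)."""
    digest = hashlib.sha256()
    for factor in factor_hashes:
        digest.update(factor.encode("utf-8"))
    return digest.hexdigest()[:10]


cache_key = combine_factors([
    "model_config:abc123",   # e.g. a config's compute_hash() output
    "torch:2.6.0+cu124",     # e.g. PyTorch / compiler versions
    "forward_fn:def456",     # e.g. a hash of the model's forward code
])
print(f"torch_compile_cache/{cache_key}")  # hypothetical directory name
```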
|
|||||||
@ -36,45 +36,43 @@ th:not(:first-child) {
|
|||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
||||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
||||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
||||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||||
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
||||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
| best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
|
||||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
|
||||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||||
|
|
||||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||||
|
|
||||||
[](){ #feature-x-hardware }
|
|
||||||
|
|
||||||
### Feature x Hardware
|
### Feature x Hardware
|
||||||
|
|
||||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU |
|
||||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
|
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|------------|
|
||||||
| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [APC](automatic_prefix_caching.md) | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
|
||||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
|
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) |
|
||||||
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) |
|
||||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ |
|
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
|
||||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) |
|
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |
|
||||||
|
|||||||
@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
|
|||||||
|
|
||||||
Set `enable_prefix_caching=True` in the vLLM engine to enable APC. Here is an example:
|
Set `enable_prefix_caching=True` in the vLLM engine to enable APC. Here is an example:
|
||||||
|
|
||||||
<gh-file:examples/offline_inference/automatic_prefix_caching.py>
|
[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
|
||||||
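As a quick inline illustration (the model name below is just a placeholder), enabling APC only requires the one flag:

```python
from vllm import LLM, SamplingParams

# enable_prefix_caching=True turns on Automatic Prefix Caching.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_prefix_caching=True)

shared_prefix = "A long document that both questions refer to..."
outputs = llm.generate(
    [shared_prefix + "\n\nQuestion 1: ...", shared_prefix + "\n\nQuestion 2: ..."],
    SamplingParams(temperature=0.0, max_tokens=64),
)
```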
|
|
||||||
## Example workloads
|
## Example workloads
|
||||||
|
|
||||||
|
|||||||
@ -17,14 +17,14 @@ Two main reasons:
|
|||||||
|
|
||||||
## Usage example
|
## Usage example
|
||||||
|
|
||||||
Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
|
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
|
||||||
|
|
||||||
vLLM now supports 5 types of connectors:
|
vLLM now supports 5 types of connectors:
|
||||||
|
|
||||||
- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
|
- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
|
||||||
- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
||||||
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
||||||
- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
|
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
|
||||||
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` field already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` field already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -45,7 +45,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
|
|||||||
|
|
||||||
## Benchmarks
|
## Benchmarks
|
||||||
|
|
||||||
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
|
Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
|||||||
@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter.
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
||||||
|
|
||||||
## Serving LoRA Adapters
|
## Serving LoRA Adapters
|
||||||
|
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
# Multimodal Inputs
|
# Multimodal Inputs
|
||||||
|
|
||||||
This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
|
This page teaches you how to pass multi-modal inputs to [multi-modal models](../models/supported_models.md#list-of-multimodal-language-models) in vLLM.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
|
We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
|
||||||
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
|
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
@ -129,7 +129,7 @@ You can pass a single image to the `'image'` field of the multi-modal dictionary
|
|||||||
print(generated_text)
|
print(generated_text)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
|
||||||
|
|
||||||
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
|||||||
print(generated_text)
|
print(generated_text)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
Full example: [examples/offline_inference/vision_language_multi_image.py](../../examples/offline_inference/vision_language_multi_image.py)
|
||||||
|
|
||||||
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
||||||
|
|
||||||
@ -346,26 +346,32 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
|
|||||||
!!! note
|
!!! note
|
||||||
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
|
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
|
||||||
|
|
||||||
### Audio Inputs
|
### Audio Inputs
|
||||||
|
|
||||||
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
|
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/audio_language.py>
|
Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
|
||||||
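A short sketch of building such an input is shown below. The audio placeholder token and the way the clip is loaded (here via `librosa`) are model- and workflow-specific, so treat them as assumptions.

```python
import librosa

# Load a clip; sr=None keeps the file's native sampling rate.
audio_array, sampling_rate = librosa.load("question.wav", sr=None)

inputs = {
    # The placeholder token is model-specific; check your model's chat template.
    "prompt": "<|audio|>\nWhat is being said in this clip?",
    "multi_modal_data": {
        "audio": (audio_array, sampling_rate),  # the (array, sampling_rate) tuple
    },
}
# outputs = llm.generate(inputs)  # with an audio-capable model loaded via vllm.LLM
```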
|
|
||||||
### Embedding Inputs
|
### Embedding Inputs
|
||||||
|
|
||||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||||
|
|
||||||
|
You must enable this feature via `enable_mm_embeds=True`.
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||||
|
Only enable this flag for trusted users!
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
# Inference with image embeddings as input
|
# Inference with image embeddings as input
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)
|
||||||
|
|
||||||
# Refer to the HuggingFace repo for the correct format to use
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
||||||
@ -397,7 +403,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
|||||||
image_embeds = torch.load(...)
|
image_embeds = torch.load(...)
|
||||||
|
|
||||||
# Qwen2-VL
|
# Qwen2-VL
|
||||||
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
llm = LLM(
|
||||||
|
"Qwen/Qwen2-VL-2B-Instruct",
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
enable_mm_embeds=True,
|
||||||
|
)
|
||||||
mm_data = {
|
mm_data = {
|
||||||
"image": {
|
"image": {
|
||||||
"image_embeds": image_embeds,
|
"image_embeds": image_embeds,
|
||||||
@ -407,7 +417,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
|||||||
}
|
}
|
||||||
|
|
||||||
# MiniCPM-V
|
# MiniCPM-V
|
||||||
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
|
llm = LLM(
|
||||||
|
"openbmb/MiniCPM-V-2_6",
|
||||||
|
trust_remote_code=True,
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
enable_mm_embeds=True,
|
||||||
|
)
|
||||||
mm_data = {
|
mm_data = {
|
||||||
"image": {
|
"image": {
|
||||||
"image_embeds": image_embeds,
|
"image_embeds": image_embeds,
|
||||||
@ -434,11 +449,11 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
|||||||
A chat template is **required** to use Chat Completions API.
|
A chat template is **required** to use Chat Completions API.
|
||||||
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
|
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
|
||||||
|
|
||||||
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
|
If no default chat template is available, we will first look for a built-in fallback in [vllm/transformers_utils/chat_templates/registry.py](../../vllm/transformers_utils/chat_templates/registry.py).
|
||||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||||
|
|
||||||
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
|
For certain models, we provide alternative chat templates inside [examples](../../examples).
|
||||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
|
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
|
||||||
|
|
||||||
### Image Inputs
|
### Image Inputs
|
||||||
|
|
||||||
@ -524,7 +539,7 @@ Then, you can use the OpenAI client as follows:
|
|||||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
|
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
|
||||||
@ -595,7 +610,7 @@ Then, you can use the OpenAI client as follows:
|
|||||||
print("Chat completion output from image url:", result)
|
print("Chat completion output from image url:", result)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
|
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
|
||||||
@ -719,7 +734,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
|
|||||||
print("Chat completion output from audio url:", result)
|
print("Chat completion output from audio url:", result)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
|
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
|
||||||
@ -732,7 +747,13 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
|
|||||||
### Embedding Inputs
|
### Embedding Inputs
|
||||||
|
|
||||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||||
pass a tensor of shape to the corresponding field of the multi-modal dictionary.
|
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||||
|
|
||||||
|
You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||||
|
Only enable this flag for trusted users!
|
||||||
|
|
||||||
#### Image Embedding Inputs
|
#### Image Embedding Inputs
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
|
|||||||
Install the NIXL library: `uv pip install nixl`, as a quick start.
|
Install the NIXL library: `uv pip install nixl`, as a quick start.
|
||||||
|
|
||||||
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
|
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
|
||||||
- The required NIXL version is specified in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
|
- The required NIXL version is specified in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
|
||||||
|
|
||||||
For non-CUDA platforms, please install NIXL with UCX built from source, as instructed below.
|
For non-CUDA platforms, please install NIXL with UCX built from source, as instructed below.
|
||||||
|
|
||||||
@ -170,6 +170,6 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
|
|||||||
|
|
||||||
Refer to these example scripts in the vLLM repository:
|
Refer to these example scripts in the vLLM repository:
|
||||||
|
|
||||||
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
||||||
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
||||||
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
||||||
|
|||||||
@ -16,16 +16,20 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
|
|||||||
|
|
||||||
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
|
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
|
||||||
|
|
||||||
<gh-file:examples/offline_inference/prompt_embed_inference.py>
|
[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
|
||||||
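As a rough sketch of the flow (the model name is a placeholder, and the offline `enable_prompt_embeds=True` engine argument is assumed to mirror the serve flag described below):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # placeholder model

# Compute prompt embeddings with the HF model's input embedding layer.
tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_model = AutoModelForCausalLM.from_pretrained(model_id)
token_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

# Pass the (seq_len, hidden_size) tensor via the 'prompt_embeds' field.
llm = LLM(model=model_id, enable_prompt_embeds=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
```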
|
|
||||||
## Online Serving
|
## Online Serving
|
||||||
|
|
||||||
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
|
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.
|
||||||
|
|
||||||
When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
|
When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
|
||||||
|
|
||||||
Prompt embeddings are passed in as base64 encoded torch tensors.
|
Prompt embeddings are passed in as base64 encoded torch tensors.
|
||||||
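A sketch of how a tensor might be serialized for the JSON payload is shown below. The `torch.save` + base64 encoding and the model name are assumptions modeled on the client example linked in this section, not an authoritative spec.

```python
import base64
import io

import torch

prompt_embeds = torch.randn(8, 2048)  # placeholder; normally taken from an embedding layer

# Serialize with torch.save, then base64-encode the bytes for the JSON body.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

payload = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",  # placeholder model name
    "prompt_embeds": encoded_embeds,
    "max_tokens": 16,
}
# client.completions.create(**payload)  # OpenAI client pointed at the vLLM server
```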
|
|
||||||
|
!!! warning
|
||||||
|
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
|
||||||
|
Only enable this flag for trusted users!
|
||||||
|
|
||||||
### Transformers Inputs via OpenAI Client
|
### Transformers Inputs via OpenAI Client
|
||||||
|
|
||||||
First, launch the OpenAI-compatible server:
|
First, launch the OpenAI-compatible server:
|
||||||
@ -37,4 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
|
[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)
|
||||||
|
|||||||
@ -64,4 +64,4 @@ th:not(:first-child) {
|
|||||||
!!! note
|
!!! note
|
||||||
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||||
|
|
||||||
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
|
For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.
|
||||||
|
|||||||
@ -196,7 +196,7 @@ The reasoning content is also available when both tool calling and the reasoning
|
|||||||
print(f"Arguments: {tool_call.arguments}")
|
print(f"Arguments: {tool_call.arguments}")
|
||||||
```
|
```
|
||||||
|
|
||||||
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
|
For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
@ -204,7 +204,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
|
|||||||
|
|
||||||
## How to support a new reasoning model
|
## How to support a new reasoning model
|
||||||
|
|
||||||
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
@ -264,7 +264,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
|
|||||||
"""
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
!!! warning
|
!!! warning
|
||||||
Please note that speculative decoding in vLLM is not yet optimized and does
|
Please note that speculative decoding in vLLM is not yet optimized and does
|
||||||
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
|
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
|
||||||
The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
|
The work to optimize it is ongoing and can be followed here: <https://github.com/vllm-project/vllm/issues/4630>
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
|
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
|
||||||
@ -183,7 +183,7 @@ A variety of speculative models of this type are available on HF hub:
|
|||||||
## Speculating using EAGLE based draft models
|
## Speculating using EAGLE based draft models
|
||||||
|
|
||||||
The following code configures vLLM to use speculative decoding where proposals are generated by
|
The following code configures vLLM to use speculative decoding where proposals are generated by
|
||||||
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
|
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](../../examples/offline_inference/spec_decode.py).
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
@ -218,8 +218,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
|
|||||||
A few important things to consider when using the EAGLE based draft models:
|
A few important things to consider when using the EAGLE based draft models:
|
||||||
|
|
||||||
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
|
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
|
||||||
be able to be loaded and used directly by vLLM after <gh-pr:12304>.
|
be able to be loaded and used directly by vLLM after <https://github.com/vllm-project/vllm/pull/12304>.
|
||||||
If you are using a vLLM version before <gh-pr:12304>, please use the
|
If you are using a vLLM version before <https://github.com/vllm-project/vllm/pull/12304>, please use the
|
||||||
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
|
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
|
||||||
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
|
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
|
||||||
|
|
||||||
@ -229,7 +229,7 @@ A few important things to consider when using the EAGLE based draft models:
|
|||||||
|
|
||||||
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
|
||||||
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
|
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
|
||||||
investigation and tracked here: <gh-issue:9565>.
|
investigation and tracked here: <https://github.com/vllm-project/vllm/issues/9565>.
|
||||||
|
|
||||||
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
|
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
|
||||||
That is, to specify `"method": "eagle3"` in `speculative_config`.
|
That is, to specify `"method": "eagle3"` in `speculative_config`.
|
||||||
@ -267,7 +267,7 @@ speculative decoding, breaking down the guarantees into three key areas:
|
|||||||
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
|
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
|
||||||
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
||||||
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
||||||
> provides a lossless guarantee. Almost all of the tests in <gh-dir:tests/spec_decode/e2e>
|
> provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../tests/spec_decode/e2e)
|
||||||
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
|
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
|
||||||
|
|
||||||
3. **vLLM Logprob Stability**
|
3. **vLLM Logprob Stability**
|
||||||
@@ -289,4 +289,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr
 - [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
 - [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
 - [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
-- [Dynamic speculative decoding](gh-issue:4565)
+- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565)
@@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
 Answer: x = -29/8
 ```

-An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>
+An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)

 ## Offline Inference

@@ -151,9 +151,9 @@ Known issues:
 much shorter than what vLLM generates. Since an exception is thrown when this condition
 is not met, the following additional chat templates are provided:

-* <gh-file:examples/tool_chat_template_mistral.jinja> - this is the "official" Mistral chat template, but tweaked so that
+* [examples/tool_chat_template_mistral.jinja](../../examples/tool_chat_template_mistral.jinja) - this is the "official" Mistral chat template, but tweaked so that
 it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
-* <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt
+* [examples/tool_chat_template_mistral_parallel.jinja](../../examples/tool_chat_template_mistral_parallel.jinja) - this is a "better" version that adds a tool-use system prompt
 when tools are provided, that results in much better reliability when working with parallel tool calling.

 Recommended flags:
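To make the recommended flags for the Mistral templates above concrete, a launch command could look like the sketch below; the model name is an assumed example, and the parallel template is chosen because of the reliability note above.

```bash
# Illustrative sketch: any Mistral tool-calling checkpoint can be substituted.
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
    --enable-auto-tool-choice \
    --tool-call-parser mistral \
    --chat-template examples/tool_chat_template_mistral_parallel.jinja
```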
@@ -187,16 +187,16 @@ Known issues:

 VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2:

-* <gh-file:examples/tool_chat_template_llama3.1_json.jinja> - this is the "official" chat template for the Llama 3.1
+* [examples/tool_chat_template_llama3.1_json.jinja](../../examples/tool_chat_template_llama3.1_json.jinja) - this is the "official" chat template for the Llama 3.1
 models, but tweaked so that it works better with vLLM.
-* <gh-file:examples/tool_chat_template_llama3.2_json.jinja> - this extends upon the Llama 3.1 chat template by adding support for
+* [examples/tool_chat_template_llama3.2_json.jinja](../../examples/tool_chat_template_llama3.2_json.jinja) - this extends upon the Llama 3.1 chat template by adding support for
 images.

 Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`

 VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended:

-* <gh-file:examples/tool_chat_template_llama4_pythonic.jinja> - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
+* [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.

 For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.

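Putting the Llama 4 guidance above together, a serving command might look like this sketch; the checkpoint is an assumed example taken from the supported-model list later in this guide.

```bash
# Illustrative sketch: combines the recommended Llama 4 pythonic parser and chat template.
vllm serve meta-llama/Llama-4-Scout-17B-16E-Instruct \
    --enable-auto-tool-choice \
    --tool-call-parser llama4_pythonic \
    --chat-template examples/tool_chat_template_llama4_pythonic.jinja
```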
@@ -212,7 +212,7 @@ Supported models:

 Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`

-<gh-file:examples/tool_chat_template_granite.jinja>: this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
+[examples/tool_chat_template_granite.jinja](../../examples/tool_chat_template_granite.jinja): this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.

 * `ibm-granite/granite-3.1-8b-instruct`

@@ -224,7 +224,7 @@ Supported models:

 Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`

-<gh-file:examples/tool_chat_template_granite_20b_fc.jinja>: this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
+[examples/tool_chat_template_granite_20b_fc.jinja](../../examples/tool_chat_template_granite_20b_fc.jinja): this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.

 ### InternLM Models (`internlm`)

@@ -282,8 +282,8 @@ Flags: `--tool-call-parser hermes`

 Supported models:

-* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
-* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
+* `MiniMaxAi/MiniMax-M1-40k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
+* `MiniMaxAi/MiniMax-M1-80k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))

 Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`

@@ -291,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m

 Supported models:

-* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>)
-* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>)
+* `deepseek-ai/DeepSeek-V3-0324` (use with [examples/tool_chat_template_deepseekv3.jinja](../../examples/tool_chat_template_deepseekv3.jinja))
+* `deepseek-ai/DeepSeek-R1-0528` (use with [examples/tool_chat_template_deepseekr1.jinja](../../examples/tool_chat_template_deepseekr1.jinja))

 Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`

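Substituting the V3 checkpoint and template listed above into those flags gives roughly the command below; parallelism and other deployment details are omitted and depend on your hardware.

```bash
# Illustrative sketch: DeepSeek-V3 class models need a multi-GPU deployment in practice.
vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --enable-auto-tool-choice \
    --tool-call-parser deepseek_v3 \
    --chat-template examples/tool_chat_template_deepseekv3.jinja
```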
@@ -300,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`

 Supported models:

-* `deepseek-ai/DeepSeek-V3.1` (use with <gh-file:examples/tool_chat_template_deepseekv31.jinja>)
+* `deepseek-ai/DeepSeek-V3.1` (use with [examples/tool_chat_template_deepseekv31.jinja](../../examples/tool_chat_template_deepseekv31.jinja))

 Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`

@@ -379,12 +379,12 @@ Limitations:

 Example supported models:

-* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `Team-ACE/ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
-* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
+* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
+* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
+* `Team-ACE/ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
+* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
+* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
+* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))

 Flags: `--tool-call-parser pythonic --chat-template {see_above}`

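Whichever parser is configured, clients pass tool definitions through the standard OpenAI Chat Completions API. A minimal request sketch is shown below; the endpoint, model name, and `get_weather` function are illustrative assumptions.

```bash
# Illustrative sketch: assumes a local vLLM server started with one of the
# tool-calling configurations described in this guide.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "messages": [{"role": "user", "content": "What is the weather in Berlin?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```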
@@ -393,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`

 ## How to Write a Tool Parser Plugin

-A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in <gh-file:vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py>.
+A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).

 Here is a summary of a plugin file:

@@ -4,19 +4,19 @@ vLLM is a Python library that supports the following CPU variants. Select your C

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:installation"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:installation"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:installation"

 === "IBM Z (S390X)"

-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"

 ## Requirements

@@ -24,19 +24,19 @@ vLLM is a Python library that supports the following CPU variants. Select your C

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:requirements"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:requirements"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:requirements"

 === "IBM Z (S390X)"

-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:requirements"

 ## Set up using Python

@@ -52,19 +52,19 @@ Currently, there are no pre-built CPU wheels.

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-wheel-from-source"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.apple.inc.md:build-wheel-from-source"

 === "IBM Z (s390x)"

-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-wheel-from-source"

 ## Set up using Docker

@@ -72,24 +72,24 @@ Currently, there are no pre-built CPU wheels.

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"

 ### Build image from source

 === "Intel/AMD x86"

-    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-image-from-source"

 === "ARM AArch64"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"

 === "Apple silicon"

-    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"

 === "IBM Z (S390X)"
-    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-image-from-source"

 ## Related runtime environment variables

@@ -153,11 +153,11 @@ VLLM_TARGET_DEVICE="tpu" python -m pip install -e .

 ### Pre-built images

-See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
+See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.

 ### Build image from source

-You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
+You can use [docker/Dockerfile.tpu](../../../docker/Dockerfile.tpu) to build a Docker image with TPU support.

 ```bash
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
@@ -11,11 +11,11 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
 # --8<-- [start:set-up-using-python]

 !!! note
-    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <https://github.com/vllm-project/vllm/issues/8420> for more details.

 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.

-Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details.
+Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-wheel-from-source) for more details.

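For example, a fresh environment as recommended above can be created and populated along these lines; the Python version and the use of `uv` are assumptions, and any virtual-environment tool works.

```bash
# Illustrative sketch: create an isolated environment, then install vLLM into it.
uv venv --python 3.12
source .venv/bin/activate
uv pip install vllm
```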
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
@@ -44,8 +44,6 @@ export CUDA_VERSION=118 # or 126
 uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
 ```

-[](){ #install-the-latest-code }
-
 #### Install the latest code

 LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
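One way to pick up the most recent of these wheels is via the nightly index, as in the sketch below; the per-commit variant of this index appears later in this file, and the exact index URL is an assumption to verify against the current documentation.

```bash
# Illustrative sketch: installs the latest nightly vLLM wheel.
pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
```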
@@ -128,11 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
 uv pip install --editable .
 ```

-You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code].
+You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).

 !!! note
     There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel.
+    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.

 #### Full build (with compilation)

@@ -250,7 +248,7 @@ uv pip install -e .
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
+See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.

 Another way to access the latest code is to use the docker images:

@@ -266,11 +264,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]

-See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
+See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.

 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]

-See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
+See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.

 # --8<-- [end:supported-features]
@@ -4,15 +4,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:installation"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:installation"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:installation"

 ## Requirements

@@ -24,15 +24,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:requirements"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:requirements"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:requirements"

 ## Set up using Python

@@ -42,45 +42,43 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:set-up-using-python"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:set-up-using-python"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:set-up-using-python"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"

 ### Pre-built wheels

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-wheels"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-wheels"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-wheels"

-[](){ #build-from-source }
-
 ### Build wheel from source

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-wheel-from-source"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-wheel-from-source"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-wheel-from-source"

 ## Set up using Docker

@@ -88,40 +86,40 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-images"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-images"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"

 ### Build image from source

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-image-from-source"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-image-from-source"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"

 ## Supported features

 === "NVIDIA CUDA"

-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:supported-features"

 === "AMD ROCm"

-    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.rocm.inc.md:supported-features"

 === "Intel XPU"

-    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features"
+    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:supported-features"
@@ -146,7 +146,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO

 #### (Optional) Build an image with ROCm software stack

-Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
+Build a docker image from [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base) which setup ROCm software stack needed by the vLLM.
 **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
 If you choose to build this rocm_base image yourself, the steps are as follows.

@@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \

 #### Build an image with vLLM

-First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
+First, build a docker image from [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) and launch a docker container from the image.
 It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

 ```bash
@@ -181,10 +181,10 @@ It is important that the user kicks off the docker build using buildkit. Either
 }
 ```

-<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
+[docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:

-- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base)
 - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image

 Their values can be passed in when running `docker build` with `--build-arg` options.
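For example, restricting the image build to a single gfx architecture could look like the sketch below; the architecture value and image tag are assumptions to adapt to your GPUs.

```bash
# Illustrative sketch: build the ROCm image for gfx942 (MI300-class) GPUs only.
DOCKER_BUILDKIT=1 docker build \
    --build-arg BASE_IMAGE=rocm/vllm-dev:base \
    --build-arg ARG_PYTORCH_ROCM_ARCH=gfx942 \
    -f docker/Dockerfile.rocm \
    -t vllm-rocm .
```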
@@ -217,6 +217,6 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]

-See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
+See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.

 # --8<-- [end:supported-features]
Some files were not shown because too many files have changed in this diff.