Compare commits


1 Commit

Author SHA1 Message Date
e17250f0d2 fix precommit 2025-06-18 21:17:43 -07:00
388 changed files with 6400 additions and 20677 deletions

@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:
-```bash
+```console
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git

@@ -102,7 +102,6 @@ steps:
 commands:
 - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
 - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
 env:
 DOCKER_BUILDKIT: "1"

@@ -118,7 +117,6 @@ steps:
 commands:
 - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
 - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
 env:
 DOCKER_BUILDKIT: "1"

@@ -54,11 +54,10 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
 --name "${container_name}" \
 ${image_name} \
 /bin/bash -c "
-set -e; # Exit on first error
 python3 /workspace/vllm/examples/offline_inference/neuron.py;
 python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
 for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-echo \"Running test file: \$f\";
+echo 'Running test file: '$f;
 python3 -m pytest \$f -v --capture=tee-sys;
 done
 "

@@ -159,8 +159,6 @@ run_and_track_test 14 "test_tpu_qkv_linear.py" \
 "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 15 "test_spmd_model_weight_loading.py" \
 "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-run_and_track_test 16 "test_kv_cache_update_kernel.py" \
-"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

@@ -28,5 +28,4 @@ docker run \
 sh -c '
 VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 '

@@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu
 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=256
-MAX_NUM_BATCHED_TOKENS=1024
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist

@@ -68,7 +68,7 @@ docker run \
 echo "run script..."
 echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
 echo "copy result back..."
 VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt

@@ -41,16 +41,6 @@ steps:
 # TODO: add `--strict` once warnings in docstrings are fixed
 - mkdocs build
-- label: Pytorch Nightly Dependency Override Check # 2min
-# if this test fails, it means the nightly torch version is not compatible with some
-# of the dependencies. Please check the error message and add the package to whitelist
-# in /vllm/tools/generate_nightly_torch_test.py
-soft_fail: true
-source_file_dependencies:
-- requirements/nightly_torch_test.txt
-commands:
-- bash standalone_tests/pytorch_nightly_dependency.sh
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
 mirror_hardwares: [amdexperimental]
 source_file_dependencies:

@@ -99,7 +89,7 @@ steps:
 - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Chunked Prefill Test
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/basic_correctness/test_chunked_prefill

@@ -178,23 +168,6 @@ steps:
 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 - popd
-- label: EPLB Algorithm Test
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/distributed/eplb
-- tests/distributed/test_eplb_algo.py
-commands:
-- pytest -v -s distributed/test_eplb_algo.py
-- label: EPLB Execution Test # 5min
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/eplb
-- tests/distributed/test_eplb_execute.py
-commands:
-- pytest -v -s distributed/test_eplb_execute.py
 - label: Metrics, Tracing Test # 10min
 mirror_hardwares: [amdexperimental, amdproduction]
 num_gpus: 2

@@ -298,15 +271,6 @@ steps:
 commands:
 - pytest -v -s prefix_caching
-- label: Platform Tests (CUDA)
-mirror_hardwares: [amdexperimental]
-source_file_dependencies:
-- vllm/
-- tests/cuda
-commands:
-- pytest -v -s cuda/test_cuda_context.py
 - label: Samplers Test # 36min
 mirror_hardwares: [amdexperimental]
 source_file_dependencies:

@@ -642,18 +606,13 @@ steps:
 - vllm/executor/
 - vllm/model_executor/models/
 - tests/distributed/
-- tests/examples/offline_inference/data_parallel.py
 commands:
 - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
 - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
 - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 - label: Distributed Tests (2 GPUs) # 40min
 mirror_hardwares: [amdexperimental]

@@ -777,7 +736,7 @@ steps:
 - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 - label: Weight Loading Multiple GPU Test - Large Models # optional
 mirror_hardwares: [amdexperimental]
 working_dir: "/vllm-workspace/tests"
 num_gpus: 2
 gpu: a100

.github/CODEOWNERS

@@ -18,10 +18,6 @@
 /vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth
-# Any change to the VllmConfig changes can have a large user-facing impact,
-# so spam a lot of people
-/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm

.github/mergify.yml

@@ -45,7 +45,6 @@ pull_request_rules:
 - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
 - files~=^vllm/model_executor/models/.*llama.*\.py
 - files~=^vllm/transformers_utils/configs/.*llama.*\.py
-- title~=(?i)llama
 actions:
 label:
 add:

@@ -66,19 +65,6 @@ pull_request_rules:
 add:
 - multi-modality
-- name: label-performance
-description: Automatically apply performance label
-conditions:
-- or:
-- files~=^benchmarks/
-- files~=^vllm/benchmarks/
-- files~=^tests/benchmarks/
-- files~=^\.buildkite/nightly-benchmarks/
-actions:
-label:
-add:
-- performance
 - name: label-qwen
 description: Automatically apply qwen label
 conditions:

@@ -88,6 +74,7 @@ pull_request_rules:
 - files~=^vllm/model_executor/models/.*qwen.*\.py
 - files~=^vllm/reasoning/.*qwen.*\.py
 - title~=(?i)Qwen
+- body~=(?i)Qwen
 actions:
 label:
 add:

@@ -53,11 +53,6 @@ repos:
 files: ^requirements/test\.(in|txt)$
 - repo: local
 hooks:
-- id: format-torch-nightly-test
-name: reformat nightly_torch_test.txt to be in sync with test.in
-language: python
-entry: python tools/generate_nightly_torch_test.py
-files: ^requirements/test\.(in|txt)$
 - id: mypy-local
 name: Run mypy for local Python installation
 entry: tools/mypy.sh 0 "local"

@@ -120,11 +115,6 @@ repos:
 entry: python tools/check_spdx_header.py
 language: python
 types: [python]
-- id: check-root-lazy-imports
-name: Check root lazy imports
-entry: python tools/check_init_lazy_imports.py
-language: python
-types: [python]
 - id: check-filenames
 name: Check for spaces in all filenames
 entry: bash

@@ -513,7 +513,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 CUDA_ARCHS "${FP4_ARCHS}")
 list(APPEND VLLM_EXT_SRC "${SRCS}")
 list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
-list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
 message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
 else()
 message(STATUS "Not building NVFP4 as no compatible archs were found.")

@@ -548,7 +547,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # if it's possible to compile MoE kernels that use its output.
 cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
+set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
+"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${SCALED_MM_ARCHS}")

@@ -566,16 +566,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 endif()
-# moe_data.cu is used by all CUTLASS MoE kernels.
-cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
-set_gencode_flags_for_srcs(
-SRCS "${SRCS}"
-CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
-list(APPEND VLLM_EXT_SRC "${SRCS}")
-endif()
 #
 # Machete kernels

@@ -648,14 +638,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # if CUDA endif
 endif()
-if (VLLM_GPU_LANG STREQUAL "HIP")
-# Add QuickReduce kernels
-list(APPEND VLLM_EXT_SRC
-"csrc/custom_quickreduce.cu"
-)
-# if ROCM endif
-endif()
 message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
 _C

@@ -154,13 +154,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
-<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
 - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
-<!-- --8<-- [end:contact-us] -->
 ## Media Kit

@@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
 datasets supported on vLLM. Its a living document, updated as new features and datasets
 become available.
-**Dataset Overview**
+## Dataset Overview
 <table style="width:100%; border-collapse: collapse;">
 <thead>

@@ -82,10 +82,7 @@ become available.
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 ---
-<details>
-<summary><b>🚀 Example - Online Benchmark</b></summary>
-<br/>
+## Example - Online Benchmark
 First start serving your model

@@ -133,8 +130,7 @@ P99 ITL (ms): 8.39
 ==================================================
 ```
-**Custom Dataset**
+### Custom Dataset
 If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
 ```

@@ -166,7 +162,7 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile
 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-**VisionArena Benchmark for Vision Language Models**
+### VisionArena Benchmark for Vision Language Models
 ```bash
 # need a model with vision capability here

@@ -184,7 +180,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 --num-prompts 1000
 ```
-**InstructCoder Benchmark with Speculative Decoding**
+### InstructCoder Benchmark with Speculative Decoding
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \

@@ -201,7 +197,7 @@ python3 benchmarks/benchmark_serving.py \
 --num-prompts 2048
 ```
-**Other HuggingFaceDataset Examples**
+### Other HuggingFaceDataset Examples
 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests

@@ -255,7 +251,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 --num-prompts 80
 ```
-**Running With Sampling Parameters**
+### Running With Sampling Parameters
 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:

@@ -273,27 +269,8 @@ python3 vllm/benchmarks/benchmark_serving.py \
 --num-prompts 10
 ```
-**Running With Ramp-Up Request Rate**
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-Two ramp-up strategies are supported:
-- `linear`: Increases the request rate linearly from a start value to an end value.
-- `exponential`: Increases the request rate exponentially.
-The following arguments can be used to control the ramp-up:
-- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
-- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
-- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
-</details>
-<details>
-<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
-<br/>
+---
+## Example - Offline Throughput Benchmark
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \

@@ -311,7 +288,7 @@ Total num prompt tokens: 5014
 Total num output tokens: 1500
 ```
-**VisionArena Benchmark for Vision Language Models**
+### VisionArena Benchmark for Vision Language Models
 ``` bash
 python3 vllm/benchmarks/benchmark_throughput.py \

@@ -331,7 +308,7 @@ Total num prompt tokens: 14527
 Total num output tokens: 1280
 ```
-**InstructCoder Benchmark with Speculative Decoding**
+### InstructCoder Benchmark with Speculative Decoding
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \

@@ -355,7 +332,7 @@ Total num prompt tokens: 261136
 Total num output tokens: 204800
 ```
-**Other HuggingFaceDataset Examples**
+### Other HuggingFaceDataset Examples
 **`lmms-lab/LLaVA-OneVision-Data`**

@@ -394,7 +371,7 @@ python3 benchmarks/benchmark_throughput.py \
 --num-prompts 10
 ```
-**Benchmark with LoRA Adapters**
+### Benchmark with LoRA Adapters
 ``` bash
 # download dataset

@@ -410,196 +387,3 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 --enable-lora \
 --lora-path yard1/llama-2-7b-sql-lora-test
 ```
-</details>
-<details>
-<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
-<br/>
-Benchmark the performance of structured output generation (JSON, grammar, regex).
-**Server Setup**
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
-```
-**JSON Schema Benchmark**
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---dataset json \
---structured-output-ratio 1.0 \
---request-rate 10 \
---num-prompts 1000
-```
-**Grammar-based Generation Benchmark**
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---dataset grammar \
---structure-type grammar \
---request-rate 10 \
---num-prompts 1000
-```
-**Regex-based Generation Benchmark**
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---dataset regex \
---request-rate 10 \
---num-prompts 1000
-```
-**Choice-based Generation Benchmark**
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---dataset choice \
---request-rate 10 \
---num-prompts 1000
-```
-**XGrammar Benchmark Dataset**
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---dataset xgrammar_bench \
---request-rate 10 \
---num-prompts 1000
-```
-</details>
-<details>
-<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
-<br/>
-Benchmark the performance of long document question-answering with prefix caching.
-**Basic Long Document QA Test**
-```bash
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
---model meta-llama/Llama-2-7b-chat-hf \
---enable-prefix-caching \
---num-documents 16 \
---document-length 2000 \
---output-len 50 \
---repeat-count 5
-```
-**Different Repeat Modes**
-```bash
-# Random mode (default) - shuffle prompts randomly
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
---model meta-llama/Llama-2-7b-chat-hf \
---enable-prefix-caching \
---num-documents 8 \
---document-length 3000 \
---repeat-count 3 \
---repeat-mode random
-# Tile mode - repeat entire prompt list in sequence
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
---model meta-llama/Llama-2-7b-chat-hf \
---enable-prefix-caching \
---num-documents 8 \
---document-length 3000 \
---repeat-count 3 \
---repeat-mode tile
-# Interleave mode - repeat each prompt consecutively
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
---model meta-llama/Llama-2-7b-chat-hf \
---enable-prefix-caching \
---num-documents 8 \
---document-length 3000 \
---repeat-count 3 \
---repeat-mode interleave
-```
-</details>
-<details>
-<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
-<br/>
-Benchmark the efficiency of automatic prefix caching.
-**Fixed Prompt with Prefix Caching**
-```bash
-python3 benchmarks/benchmark_prefix_caching.py \
---model meta-llama/Llama-2-7b-chat-hf \
---enable-prefix-caching \
---num-prompts 1 \
---repeat-count 100 \
---input-length-range 128:256
-```
-**ShareGPT Dataset with Prefix Caching**
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 benchmarks/benchmark_prefix_caching.py \
---model meta-llama/Llama-2-7b-chat-hf \
---dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
---enable-prefix-caching \
---num-prompts 20 \
---repeat-count 5 \
---input-length-range 128:256
-```
-</details>
-<details>
-<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
-<br/>
-Benchmark the performance of request prioritization in vLLM.
-**Basic Prioritization Test**
-```bash
-python3 benchmarks/benchmark_prioritization.py \
---model meta-llama/Llama-2-7b-chat-hf \
---input-len 128 \
---output-len 64 \
---num-prompts 100 \
---scheduling-policy priority
-```
-**Multiple Sequences per Prompt**
-```bash
-python3 benchmarks/benchmark_prioritization.py \
---model meta-llama/Llama-2-7b-chat-hf \
---input-len 128 \
---output-len 64 \
---num-prompts 100 \
---scheduling-policy priority \
---n 2
-```
-</details>

@@ -10,7 +10,6 @@
 # 3. Set variables (ALL REQUIRED)
 # BASE: your directory for vllm repo
 # MODEL: the model served by vllm
-# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
 # TP: ways of tensor parallelism
 # DOWNLOAD_DIR: directory to download and load model weights.
 # INPUT_LEN: request input len

@@ -35,7 +34,6 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000

@@ -47,15 +45,12 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
-PROFILE_PATH="$LOG_FOLDER/profile"
 echo "result file: $RESULT"
 echo "model: $MODEL"
 rm -rf $LOG_FOLDER
-rm -rf $PROFILE_PATH
 mkdir -p $LOG_FOLDER
-mkdir -p $PROFILE_PATH
 cd "$BASE/vllm"

@@ -75,11 +70,10 @@ start_server() {
 local max_num_seqs=$2
 local max_num_batched_tokens=$3
 local vllm_log=$4
-local profile_dir=$5
 pkill -f vllm
-VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
+VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
 --disable-log-requests \
 --port 8004 \
 --gpu-memory-utilization $gpu_memory_utilization \

@@ -111,37 +105,19 @@ start_server() {
 fi
 }
-update_best_profile() {
-local profile_dir=$1
-local profile_index=$2
-sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
-selected_profile_file=
-if [[ "$SYSTEM" == "TPU" ]]; then
-selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
-fi
-if [[ "$SYSTEM" == "GPU" ]]; then
-selected_profile_file="${sorted_paths[$profile_index]}"
-fi
-rm -f $PROFILE_PATH/*
-cp $selected_profile_file $PROFILE_PATH
-}
 run_benchmark() {
 local max_num_seqs=$1
 local max_num_batched_tokens=$2
 local gpu_memory_utilization=$3
 echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
 local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
 echo "vllm_log: $vllm_log"
 echo
 rm -f $vllm_log
-mkdir -p $profile_dir
 pkill -f vllm
-local profile_index=0
 echo "starting server..."
-start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
+start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
 result=$?
 if [[ "$result" -eq 1 ]]; then
 echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"

@@ -168,8 +144,7 @@ run_benchmark() {
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 1000 \
 --random-prefix-len $prefix_len \
---port 8004 \
---profile &> "$bm_log"
+--port 8004 &> "$bm_log"
 throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
 goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

@@ -183,7 +158,6 @@ run_benchmark() {
 # start from request-rate as int(throughput) + 1
 request_rate=$((${throughput%.*} + 1))
 while ((request_rate > 0)); do
-profile_index=$((profile_index+1))
 # clear prefix cache
 curl -X POST http://0.0.0.0:8004/reset_prefix_cache
 sleep 5

@@ -221,12 +195,6 @@ run_benchmark() {
 best_max_num_seqs=$max_num_seqs
 best_num_batched_tokens=$max_num_batched_tokens
 best_goodput=$goodput
-if [[ "$SYSTEM" == "TPU" ]]; then
-update_best_profile "$profile_dir/plugins/profile" $profile_index
-fi
-if [[ "$SYSTEM" == "GPU" ]]; then
-update_best_profile "$profile_dir" $profile_index
-fi
 fi
 else
 echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"

@@ -271,6 +239,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
 done
 done
 echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

@@ -404,14 +404,8 @@ async def async_request_openai_chat_completions(
 chunk_bytes = chunk_bytes.strip()
 if not chunk_bytes:
 continue
-chunk_bytes = chunk_bytes.decode("utf-8")
-# NOTE: SSE comments (often used as pings) start with a colon.
-# These are not JSON data payload and should be skipped.
-if chunk_bytes.startswith(":"):
-continue
-chunk = chunk_bytes.removeprefix("data: ")
+chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
 if chunk != "[DONE]":
 timestamp = time.perf_counter()
 data = json.loads(chunk)

@@ -349,12 +349,11 @@ class RandomDataset(BenchmarkDataset):
 # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
 # To avoid uncontrolled change of the prompt length,
 # the encoded sequence is truncated before being decode again.
-total_input_len = prefix_len + int(input_lens[i])
 re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-:total_input_len
+: input_lens[i]
 ]
 prompt = tokenizer.decode(re_encoded_sequence)
-total_input_len = len(re_encoded_sequence)
+total_input_len = prefix_len + int(input_lens[i])
 requests.append(
 SampleRequest(
 prompt=prompt,

@@ -33,7 +33,7 @@ import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Literal, Optional
+from typing import Any, Optional
 import numpy as np
 from tqdm.asyncio import tqdm

@@ -107,42 +107,14 @@ class BenchmarkMetrics:
 percentiles_e2el_ms: list[tuple[float, float]]
-def _get_current_request_rate(
-ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-ramp_up_start_rps: Optional[int],
-ramp_up_end_rps: Optional[int],
-request_index: int,
-total_requests: int,
-request_rate: float,
-) -> float:
-if (
-ramp_up_strategy
-and ramp_up_start_rps is not None
-and ramp_up_end_rps is not None
-):
-progress = request_index / max(total_requests - 1, 1)
-if ramp_up_strategy == "linear":
-increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
-return ramp_up_start_rps + increase
-elif ramp_up_strategy == "exponential":
-ratio = ramp_up_end_rps / ramp_up_start_rps
-return ramp_up_start_rps * (ratio**progress)
-else:
-raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
-return request_rate
 async def get_request(
 input_requests: list[SampleRequest],
 request_rate: float,
 burstiness: float = 1.0,
-ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-ramp_up_start_rps: Optional[int] = None,
-ramp_up_end_rps: Optional[int] = None,
-) -> AsyncGenerator[tuple[SampleRequest, float], None]:
+) -> AsyncGenerator[SampleRequest, None]:
 """
 Asynchronously generates requests at a specified rate
-with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
+with OPTIONAL burstiness.
 Args:
 input_requests:

@@ -157,44 +129,22 @@ async def get_request(
 A lower burstiness value (0 < burstiness < 1) results
 in more bursty requests, while a higher burstiness value
 (burstiness > 1) results in a more uniform arrival of requests.
-ramp_up_strategy (optional):
-The ramp-up strategy. Can be "linear" or "exponential".
-If None, uses constant request rate (specified by request_rate).
-ramp_up_start_rps (optional):
-The starting request rate for ramp-up.
-ramp_up_end_rps (optional):
-The ending request rate for ramp-up.
 """
-input_requests: Iterable[SampleRequest] = iter(input_requests)
+# Calculate scale parameter theta to maintain the desired request_rate.
 assert burstiness > 0, (
 f"A positive burstiness factor is expected, but given {burstiness}."
 )
-# Convert to list to get length for ramp-up calculations
-if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
-input_requests = list(input_requests)
-total_requests = len(input_requests)
-request_index = 0
+theta = 1.0 / (request_rate * burstiness)
 for request in input_requests:
-current_request_rate = _get_current_request_rate(
-ramp_up_strategy,
-ramp_up_start_rps,
-ramp_up_end_rps,
-request_index,
-total_requests,
-request_rate,
-)
-yield request, current_request_rate
-request_index += 1
-if current_request_rate == float("inf"):
+yield request
+if request_rate == float("inf"):
 # If the request rate is infinity, then we don't need to wait.
 continue
-theta = 1.0 / (current_request_rate * burstiness)
 # Sample the request interval from the gamma distribution.
 # If burstiness is 1, it follows exponential distribution.
 interval = np.random.gamma(shape=burstiness, scale=theta)

@@ -340,9 +290,6 @@ async def benchmark(
 max_concurrency: Optional[int],
 lora_modules: Optional[Iterable[str]],
 extra_body: Optional[dict],
-ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-ramp_up_start_rps: Optional[int] = None,
-ramp_up_end_rps: Optional[int] = None,
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]

@@ -406,15 +353,7 @@ async def benchmark(
 distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-if ramp_up_strategy is not None:
-print(
-f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
-f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
-"the duration of the benchmark."
-)
-else:
-print(f"Traffic request rate: {request_rate} RPS.")
+print(f"Traffic request rate: {request_rate}")
 print(f"Burstiness factor: {burstiness} ({distribution})")
 print(f"Maximum request concurrency: {max_concurrency}")

@@ -434,34 +373,7 @@ async def benchmark(
 benchmark_start_time = time.perf_counter()
 tasks: list[asyncio.Task] = []
-rps_change_events = []
-last_int_rps = -1
-if ramp_up_strategy is not None and ramp_up_start_rps is not None:
-last_int_rps = ramp_up_start_rps
-rps_change_events.append(
-{
-"rps": last_int_rps,
-"timestamp": datetime.now().isoformat(),
-}
-)
-async for request, current_request_rate in get_request(
-input_requests,
-request_rate,
-burstiness,
-ramp_up_strategy,
-ramp_up_start_rps,
-ramp_up_end_rps,
-):
-if ramp_up_strategy is not None:
-current_int_rps = int(current_request_rate)
-if current_int_rps > last_int_rps:
-timestamp = datetime.now().isoformat()
-for rps_val in range(last_int_rps + 1, current_int_rps + 1):
-rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
-last_int_rps = current_int_rps
+async for request in get_request(input_requests, request_rate, burstiness):
 prompt, prompt_len, output_len, mm_content = (
 request.prompt,
 request.prompt_len,

@@ -485,8 +397,11 @@ async def benchmark(
 ignore_eos=ignore_eos,
 extra_body=extra_body,
 )
-task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
-tasks.append(asyncio.create_task(task))
+tasks.append(
+asyncio.create_task(
+limited_request_func(request_func_input=request_func_input, pbar=pbar)
+)
+)
 outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
 if profile:

@@ -562,9 +477,6 @@ async def benchmark(
 "errors": [output.error for output in outputs],
 }
-if rps_change_events:
-result["rps_change_events"] = rps_change_events
 def process_one_metric(
 # E.g., "ttft"
 metric_attribute_name: str,

@@ -698,26 +610,6 @@ def main(args: argparse.Namespace):
 tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
 tokenizer_mode = args.tokenizer_mode
-# Validate ramp-up arguments
-if args.ramp_up_strategy is not None:
-if args.request_rate != float("inf"):
-raise ValueError(
-"When using ramp-up, do not specify --request-rate. "
-"The request rate will be controlled by ramp-up parameters. "
-"Please remove the --request-rate argument."
-)
-if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
-raise ValueError(
-"When using --ramp-up-strategy, both --ramp-up-start-rps and "
-"--ramp-up-end-rps must be specified"
-)
-if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
-raise ValueError("Ramp-up start and end RPS must be non-negative")
-if args.ramp_up_start_rps > args.ramp_up_end_rps:
-raise ValueError("Ramp-up start RPS must be less than end RPS")
-if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
-raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
 if args.base_url is not None:
 api_url = f"{args.base_url}{args.endpoint}"
 base_url = f"{args.base_url}"

@@ -910,9 +802,6 @@ def main(args: argparse.Namespace):
 max_concurrency=args.max_concurrency,
 lora_modules=args.lora_modules,
 extra_body=sampling_params,
-ramp_up_strategy=args.ramp_up_strategy,
-ramp_up_start_rps=args.ramp_up_start_rps,
-ramp_up_end_rps=args.ramp_up_end_rps,
 )
 )

@@ -945,11 +834,6 @@ def main(args: argparse.Namespace):
 result_json["burstiness"] = args.burstiness
 result_json["max_concurrency"] = args.max_concurrency
-if args.ramp_up_strategy is not None:
-result_json["ramp_up_strategy"] = args.ramp_up_strategy
-result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
-result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
 # Merge with benchmark result
 result_json = {**result_json, **benchmark_result}

@@ -975,10 +859,7 @@ def main(args: argparse.Namespace):
 if args.max_concurrency is not None
 else ""
 )
-if args.ramp_up_strategy is not None:
-file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
-else:
-file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
+file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
 if args.result_filename:
 file_name = args.result_filename
 if args.result_dir:

@@ -1344,31 +1225,6 @@ def create_argument_parser():
 "script chooses a LoRA module at random.",
 )
-parser.add_argument(
-"--ramp-up-strategy",
-type=str,
-default=None,
-choices=["linear", "exponential"],
-help="The ramp-up strategy. This would be used to "
-"ramp up the request rate from initial RPS to final "
-"RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
-"over the duration of the benchmark.",
-)
-parser.add_argument(
-"--ramp-up-start-rps",
-type=int,
-default=None,
-help="The starting request rate for ramp-up (RPS). "
-"Needs to be specified when --ramp-up-strategy is used.",
-)
-parser.add_argument(
-"--ramp-up-end-rps",
-type=int,
-default=None,
-help="The ending request rate for ramp-up (RPS). "
-"Needs to be specified when --ramp-up-strategy is used.",
-)
 return parser

@@ -97,7 +97,7 @@ def run_vllm(
 assert lora_requests is None, "BeamSearch API does not support LoRA"
 prompts = [request.prompt for request in requests]
 # output_len should be the same for all requests.
-output_len = requests[0].expected_output_len
+output_len = requests[0][2]
 for request in requests:
 assert request.expected_output_len == output_len
 start = time.perf_counter()

@@ -19,7 +19,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 w8a8_block_fp8_matmul,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils import FlexibleArgumentParser
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

@@ -117,9 +117,14 @@ def bench_fp8(
 scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
 scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
+def ceil_div(x: int, y: int) -> int:
+return (x + y - 1) // y
+block_scale_a = torch.rand(
+(m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
+)
 block_scale_b = torch.rand(
-cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
+ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
 )
 block_scale_a_M_major = block_scale_a.t().contiguous().t()
 block_scale_b_K_major = block_scale_b.t().contiguous().t()

View File

@ -22,16 +22,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_GROUP_SIZES,
query_marlin_supported_quant_types, query_marlin_supported_quant_types,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
FP4_MARLIN_SUPPORTED_GROUP_SIZES,
rand_marlin_weight_fp4_like,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
marlin_quant_fp8_torch,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, MarlinWorkspace,
awq_marlin_quantize,
marlin_quantize, marlin_quantize,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
@ -43,7 +35,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights, quantize_weights,
sort_weights, sort_weights,
) )
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@ -65,144 +57,80 @@ def bench_run(
size_n: int, size_n: int,
): ):
label = "Quant Matmul" label = "Quant Matmul"
sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
) )
print(f"Testing: {sub_label}") print(f"Testing: {sub_label}")
a = torch.randn(size_m, size_k).to(torch.half).cuda() a = torch.randn(size_m, size_k).to(torch.half).cuda()
b = torch.rand(size_k, size_n).to(torch.half).cuda() b = torch.rand(size_k, size_n).to(torch.half).cuda()
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
if act_order and (group_size == -1 or group_size == size_k or has_zp):
return
if size_k % group_size != 0:
return
marlin_24_supported = ( a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda()
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
def gen_marlin_params():
# Marlin quant
marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
if quant_type == scalar_types.float4_e2m1f:
if group_size != 16 or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
b.T, group_size
)
elif quant_type == scalar_types.float8_e4m3fn:
if group_size not in [-1, 128] or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
elif group_size == 16:
return
elif has_zp:
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
b, quant_type, group_size
)
else:
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
marlin_quantize(b, quant_type, group_size, act_order)
)
return (
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
)
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params():
q_w_gptq = None
repack_sort_indices = None
if repack_supported:
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
return q_w_gptq, repack_sort_indices
def gen_allspark_params():
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
CUBLAS_M_THRESHOLD
) = None
nonlocal allspark_supported
if allspark_supported:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
allspark_supported = allspark_supported and supported_arch
if supported_arch:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
return (
qw_reorder,
s_reorder,
zp_reorder,
sm_count,
sm_version,
CUBLAS_M_THRESHOLD,
)
    # Marlin quant
-   (
-       marlin_w_ref,
-       marlin_q_w,
-       marlin_s,
-       marlin_s2,
-       marlin_zp,
-       marlin_g_idx,
-       marlin_sort_indices,
-   ) = gen_marlin_params()
-   marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
-       gen_marlin_24_params()
-   )
-   q_w_gptq, repack_sort_indices = gen_repack_params()
-   qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
-       gen_allspark_params()
-   )
+   (
+       marlin_w_ref,
+       marlin_q_w,
+       marlin_s,
+       marlin_g_idx,
+       marlin_sort_indices,
+       marlin_rand_perm,
+   ) = marlin_quantize(b, quant_type, group_size, act_order)
+
+   # Marlin_24 quant
+   (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
+       marlin_24_quantize(b, quant_type, group_size)
+   )
+   marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
+
+   # GPTQ quant
+   (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
+       b, quant_type, group_size, act_order
+   )
+   q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
+
+   # For act_order, sort the "weights" and "g_idx"
+   # so that group ids are increasing
+   repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+   if act_order:
+       (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)

    # Prepare
    marlin_workspace = MarlinWorkspace(
        size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
    )
    marlin_24_workspace = MarlinWorkspace(
        size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
    )
+   marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
+
+   # AllSpark W8A16 quant
+   as_supported_case = (
+       quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
+       and group_size == -1
+       and not act_order
+       and is_k_full
+   )
+   if as_supported_case:
+       properties = torch.cuda.get_device_properties(b.device.index)
+       sm_count = properties.multi_processor_count
+       sm_version = properties.major * 10 + properties.minor
+
+       supported_arch = sm_version >= 80 and sm_version < 90
+       as_supported_case = as_supported_case and supported_arch
+       if supported_arch:
+           has_zp = False
+           w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
+           qw = qw.to(torch.uint8)
+
+           qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
+               qw, s, zp, has_zp
+           )
+           CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD

    globals = {
        # Gen params
@@ -212,14 +140,15 @@ def bench_run(
        "size_n": size_n,
        "size_k": size_k,
        "a": a,
+       "a_tmp": a_tmp,
        # Marlin params
        "marlin_w_ref": marlin_w_ref,
        "marlin_q_w": marlin_q_w,
        "marlin_s": marlin_s,
-       "marlin_s2": marlin_s2,
        "marlin_zp": marlin_zp,
        "marlin_g_idx": marlin_g_idx,
        "marlin_sort_indices": marlin_sort_indices,
+       "marlin_rand_perm": marlin_rand_perm,
        "marlin_workspace": marlin_workspace,
        "is_k_full": is_k_full,
        # Marlin_24 params
@@ -232,12 +161,12 @@ def bench_run(
        "q_w_gptq": q_w_gptq,
        "repack_sort_indices": repack_sort_indices,
        # AllSpark W8A16 params
-       "qw_reorder": qw_reorder,
-       "s_reorder": s_reorder,
-       "zp_reorder": zp_reorder,
-       "sm_count": sm_count,
-       "sm_version": sm_version,
-       "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
+       "qw_reorder": qw_reorder if as_supported_case else None,
+       "s_reorder": s_reorder if as_supported_case else None,
+       "zp_reorder": zp_reorder if as_supported_case else None,
+       "sm_count": sm_count if as_supported_case else None,
+       "sm_version": sm_version if as_supported_case else None,
+       "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None,
        # Kernels
        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
@@ -248,7 +177,7 @@ def bench_run(
    min_run_time = 1

    # Warmup pytorch
-   for _ in range(5):
+   for i in range(5):
        torch.matmul(a, marlin_w_ref)

    results.append(
@@ -263,17 +192,17 @@ def bench_run(
    results.append(
        benchmark.Timer(
-           stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+           stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
-           description="gptq_marlin_gemm",
+           description="gptq_marlin_gemm_fp16",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    results.append(
        benchmark.Timer(
-           stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+           stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@@ -281,7 +210,10 @@ def bench_run(
        ).blocked_autorange(min_run_time=min_run_time)
    )

-   if marlin_24_supported:
+   if (
+       quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
+       and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
+   ):
        results.append(
            benchmark.Timer(
                stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
@@ -292,18 +224,17 @@ def bench_run(
            ).blocked_autorange(min_run_time=min_run_time)
        )

-   if repack_supported:
-       results.append(
-           benchmark.Timer(
-               stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)",  # noqa: E501
-               globals=globals,
-               label=label,
-               sub_label=sub_label,
-               description="gptq_marlin_repack",
-           ).blocked_autorange(min_run_time=min_run_time)
-       )
+   results.append(
+       benchmark.Timer(
+           stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)",  # noqa: E501
+           globals=globals,
+           label=label,
+           sub_label=sub_label,
+           description="gptq_marlin_repack",
+       ).blocked_autorange(min_run_time=min_run_time)
+   )

-   if allspark_supported:
+   if as_supported_case:
        results.append(
            benchmark.Timer(
                stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)",  # noqa: E501
@@ -319,6 +250,7 @@ def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")
+
    results: list[benchmark.Measurement] = []

    for model in args.models:
@@ -346,17 +278,14 @@ def main(args):
            ):
                continue

-           for quant_type in query_marlin_supported_quant_types():
+           for quant_type in query_marlin_supported_quant_types(False):
                if (
                    len(args.limit_num_bits) > 0
                    and quant_type.size_bits not in args.limit_num_bits
                ):
                    continue

-               for group_size in (
-                   MARLIN_SUPPORTED_GROUP_SIZES
-                   + FP4_MARLIN_SUPPORTED_GROUP_SIZES
-               ):
+               for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
                    if (
                        len(args.limit_group_size) > 0
                        and group_size not in args.limit_group_size
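
A minimal standalone sketch of the `torch.utils.benchmark.Timer` pattern used throughout this file may help when reproducing a single measurement outside the harness; the shapes and labels below are illustrative, not taken from the benchmark:

```python
import torch
import torch.utils.benchmark as benchmark

a = torch.randn(16, 4096, device="cuda", dtype=torch.half)
w = torch.randn(4096, 4096, device="cuda", dtype=torch.half)

# stmt is evaluated against `globals`; blocked_autorange() repeats it in
# blocks until at least min_run_time seconds of samples have been collected.
measurement = benchmark.Timer(
    stmt="torch.matmul(a, w)",
    globals={"a": a, "w": w, "torch": torch},
    label="gemm",
    sub_label="16x4096x4096",
    description="torch.matmul",
).blocked_autorange(min_run_time=1)
print(measurement)
```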

View File

@@ -85,6 +85,12 @@ def benchmark_shape(m: int,

    # === DeepGEMM Implementation ===
    def deepgemm_gemm():
+       # A quantization is inside the loop as it depends on activations
+       # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
+       # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
+       #     A, block_size[1])
+       # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
+       # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
        deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
                                       (B_deepgemm, B_scale_deepgemm),
                                       C_deepgemm)
@@ -92,6 +98,8 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
+       # A quantization is inside the loop as it depends on activations
+       # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
        return w8a8_block_fp8_matmul(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
@@ -101,6 +109,9 @@ def benchmark_shape(m: int,

    # === vLLM CUTLASS Implementation ===
    def vllm_cutlass_gemm():
+       # A quantization is inside the loop as it depends on activations
+       # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
+       #     A, block_size[1], column_major_scales=True)
        return ops.cutlass_scaled_mm(A_vllm_cutlass,
                                     B_vllm.T,
                                     scale_a=A_scale_vllm_cutlass,
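
The comments added above all make the same point: quantizing `A` has to happen inside the timed loop because the scales are derived from the activation values themselves. A rough torch-only sketch of per-token-group FP8 quantization (this approximates `per_token_group_quant_fp8`; the 448 e4m3 max and the layout are assumptions, not the vLLM kernel):

```python
import torch

def per_token_group_quant_fp8_sketch(A: torch.Tensor, group_size: int):
    # Each token's hidden dim is split into groups; every group gets its
    # own scale derived from its absolute maximum, so the result depends
    # on A and cannot be hoisted out of the benchmark loop.
    FP8_MAX = 448.0  # torch.finfo(torch.float8_e4m3fn).max
    m, k = A.shape
    groups = A.float().reshape(m, k // group_size, group_size)
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax / FP8_MAX
    q = (groups / scale).clamp(-FP8_MAX, FP8_MAX).reshape(m, k)
    return q.to(torch.float8_e4m3fn), scale.squeeze(-1)
```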

View File

@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
        vllm-flash-attn
        GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-       GIT_TAG 5f3644181c7a15345ce20bfc65af117d3601b524
+       GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb
        GIT_PROGRESS TRUE
        # Don't share the vllm-flash-attn build between build types
        BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -207,7 +207,7 @@ void cutlass_mla_decode_sm100a(torch::Tensor const& out,
              "page_table must be a 32-bit integer tensor");

  auto in_dtype = q_nope.dtype();
- const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
+ at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(q_nope.get_device());
  if (in_dtype == at::ScalarType::Half) {

View File

@@ -131,19 +131,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization
#ifdef __AVX512F__
- at::Tag stride_tag = at::Tag::needs_fixed_stride_order;

  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
-     "Tensor? azp) -> ()",
-     {stride_tag});
+     "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
-     "Tensor!? azp) -> ()",
-     {stride_tag});
+     "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);

  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
@@ -151,8 +148,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      " Tensor b, Tensor a_scales,"
-     " Tensor b_scales, Tensor? bias) -> ()",
-     {stride_tag});
+     " Tensor b_scales, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);

  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
  // quantization.
@@ -160,8 +156,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
      " Tensor b, Tensor a_scales,"
      " Tensor b_scales, Tensor azp_adj,"
-     " Tensor? azp, Tensor? bias) -> ()",
-     {stride_tag});
+     " Tensor? azp, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#elif defined(__powerpc64__)
  // Compute int8 quantized tensor for given scaling factor.

View File

@@ -1,114 +0,0 @@
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/all.h>
#ifdef USE_ROCM
#include "quickreduce/quick_reduce.h"
quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size,
std::optional<int64_t> qr_max_size) {
if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported");
if (world_size == 6)
throw std::invalid_argument("world size == 6 is not supported");
if (world_size % 2 != 0)
throw std::invalid_argument("Odd num gpus is not supported for now");
if (rank < 0 || rank >= world_size)
throw std::invalid_argument("invalid rank passed in");
quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
fptr->init(world_size, rank, qr_max_size);
return (quickreduce::fptr_t)fptr;
}
void qr_destroy(quickreduce::fptr_t _fa) {
if (_fa) {
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
fa->destroy();
delete fa;
}
}
torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
hipIpcMemHandle_t handle = fa->get_handle();
auto options =
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
auto data_handle =
torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t));
return data_handle;
}
void qr_open_handles(quickreduce::fptr_t _fa,
const std::vector<torch::Tensor>& handles) {
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
std::vector<hipIpcMemHandle_t> ipc_handles;
ipc_handles.reserve(handles.size());
for (auto& handle : handles) {
// Ensure the tensor is on the same device as the current device.
hipIpcMemHandle_t ipc_handle;
std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
ipc_handles.push_back(ipc_handle);
}
fa->open_ipc_handles(ipc_handles);
}
void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp,
torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();
TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
TORCH_CHECK_EQ(inp.numel(), out.numel());
TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize);
if (out.scalar_type() == at::ScalarType::Half) {
fa->allreduce<half, false>(reinterpret_cast<half*>(inp.data_ptr()),
reinterpret_cast<half*>(out.data_ptr()),
out.numel(), quant_level, stream);
} else if (out.scalar_type() == at::ScalarType::BFloat16) {
if (cast_bf2half) {
fa->allreduce<half, true>(reinterpret_cast<half*>(inp.data_ptr()),
reinterpret_cast<half*>(out.data_ptr()),
out.numel(), quant_level, stream);
} else {
fa->allreduce<quickreduce::nv_bfloat16, false>(
reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
out.numel(), quant_level, stream);
}
} else {
throw std::runtime_error(
"quick allreduce only supports float16 and bfloat16");
}
}
int64_t qr_max_size() {
// The default is 2GB (2,147,483,648 bytes)
return static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
}
#define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half) \
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>, \
cast_bf2half>; \
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>, \
cast_bf2half>; \
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)
#endif // USE_ROCM
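
For orientation, this is how the bindings above might be driven from Python; the `qr` object and the process-group plumbing are assumptions for illustration only, and just the call order mirrors the C++ entry points in this file:

```python
import torch
import torch.distributed as dist

def setup_quick_allreduce(qr, rank: int, world_size: int):
    fa = qr.init_custom_qr(rank, world_size)  # opaque DeviceComms* as int64
    handle = qr.qr_get_handle(fa)             # IPC handle as a uint8 CPU tensor
    handles = [torch.empty_like(handle) for _ in range(world_size)]
    dist.all_gather(handles, handle)          # exchange handles (CPU/gloo group)
    qr.qr_open_handles(fa, handles)           # map peers' buffers into this rank
    return fa

def quick_allreduce(qr, fa, inp: torch.Tensor, quant_level: int = 1):
    # quant_level follows QuickReduceQuantLevel: 0=F16, 1=INT8, 2=INT6, 3=INT4
    out = torch.empty_like(inp)
    qr.qr_all_reduce(fa, inp, out, quant_level, False)
    return out
```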

View File

@@ -185,7 +185,9 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
    params.conv_states_ptr = nullptr;
  }

- const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+ // Otherwise the kernel will be launched from cuda:0 device
+ // Cast to char to avoid compiler warning about narrowing
+ at::cuda::CUDAGuard device_guard{(char)x.get_device()};
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
    causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
@@ -276,7 +278,9 @@ void causal_conv1d_update(const at::Tensor &x,
    params.conv_state_indices_ptr = nullptr;
  }

- const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+ // Otherwise the kernel will be launched from cuda:0 device
+ // Cast to char to avoid compiler warning about narrowing
+ at::cuda::CUDAGuard device_guard{(char)x.get_device()};
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
    causal_conv1d_update_cuda<input_t, weight_t>(params, stream);

View File

@@ -647,7 +647,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
  );

- const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
+ // Otherwise the kernel will be launched from cuda:0 device
+ // Cast to char to avoid compiler warning about narrowing
+ at::cuda::CUDAGuard device_guard{(char)u.get_device()};
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
    selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
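
All three hunks above make the same swap: an `OptionalCUDAGuard` keyed off the input becomes an unconditional `CUDAGuard` pinned to the input's device. In Python terms the concern looks roughly like this (a sketch, not vLLM code):

```python
import torch

def scale_on_input_device(x: torch.Tensor) -> torch.Tensor:
    # Analogue of the CUDAGuard above: make x.device the current device for
    # the duration of the launch; otherwise stream queries and kernel
    # launches would target the default device, cuda:0.
    with torch.cuda.device(x.device):
        stream = torch.cuda.current_stream()  # now the stream on x's device
        with torch.cuda.stream(stream):
            return x * 2.0  # stand-in for the real kernel launch
```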

View File

@@ -360,14 +360,3 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
    int64_t size);
int64_t open_mem_handle(torch::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
-
-#ifdef USE_ROCM
-fptr_t init_custom_qr(int64_t rank, int64_t world_size,
-                      std::optional<int64_t> qr_max_size = std::nullopt);
-void qr_destroy(fptr_t _fa);
-torch::Tensor qr_get_handle(fptr_t _fa);
-void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
-void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
-                   int64_t quant_level, bool cast_bf2half = false);
-int64_t qr_max_size();
-#endif

View File

@@ -29,12 +29,26 @@ struct sm100_fp8_config_default {
template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M256 {
-  // M in (64, 256]
+  // M in (128, 256]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_2, _4, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
@@ -43,26 +57,12 @@ struct sm100_fp8_config_M256 {
template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M64 {
-  // M in (16, 64]
+  // M in [1, 64]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M16 {
-  // M in [1, 16]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _4, _1>;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
@@ -82,27 +82,27 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
  using Cutlass3xGemmDefault =
      typename sm100_fp8_config_default<InType, OutType,
                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM16 =
-      typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
  using Cutlass3xGemmM64 =
      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
  using Cutlass3xGemmM256 =
      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;

  uint32_t const m = a.size(0);
  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2

-  if (mp2 <= 16) {
-    // m in [1, 16]
-    return cutlass_gemm_caller<Cutlass3xGemmM16>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 64) {
-    // m in (16, 64]
+  if (mp2 <= 64) {
+    // m in [1, 64]
    return cutlass_gemm_caller<Cutlass3xGemmM64>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
  } else if (mp2 <= 256) {
-    // m in (64, 256]
+    // m in (128, 256]
    return cutlass_gemm_caller<Cutlass3xGemmM256>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
  } else {
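
A small Python sketch of the new bucket selection on the right-hand side of this hunk (bucket names stand in for the config structs; `next_pow_2` behaves like the C++ helper):

```python
def next_pow_2(x: int) -> int:
    return 1 if x <= 1 else 1 << (x - 1).bit_length()

def select_fp8_config(m: int) -> str:
    # Round M up to a power of two, clamp at 64, then pick the tile config.
    mp2 = max(64, next_pow_2(m))
    if mp2 <= 64:
        return "M64"    # TileShape 64x64x256,   ClusterShape 1x8x1
    if mp2 <= 128:
        return "M128"   # TileShape 128x128x256, ClusterShape 2x4x1
    if mp2 <= 256:
        return "M256"   # TileShape 128x128x128, ClusterShape 2x2x1
    return "default"
```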

View File

@@ -241,7 +241,7 @@ void get_cutlass_moe_mm_data(
  // mm to run it for.
  int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
-    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
+    (defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
  get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
                                 problem_sizes2, input_permutation,
                                 output_permutation, num_experts, n, k,
@@ -252,7 +252,7 @@ void get_cutlass_moe_mm_data(
      false,
      "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
      "CUDA device capability: ",
-     version_num, ". Required capability: 90 or 100");
+     version_num, ". Required capability: 90");
}

void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@@ -265,8 +265,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
  // This function currently gets compiled only if we have a valid cutlass moe
  // mm to run it for.
  int32_t version_num = get_sm_version_num();
-#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
-    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
+#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
  get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
                                      problem_sizes2, expert_num_tokens,
                                      num_local_experts, padded_m, n, k);
@@ -276,7 +275,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
      false,
      "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
      "for CUDA device capability: ",
-     version_num, ". Required capability: 90 or 100");
+     version_num, ". Required capability: 90");
}

void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@@ -561,7 +561,7 @@ void scaled_fp4_experts_quant_sm100a(
  TORCH_CHECK(output_scale.size(1) * 4 == padded_k);

  auto in_dtype = input.dtype();
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+ at::cuda::CUDAGuard device_guard{(char)input.get_device()};
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(input.get_device());
  if (in_dtype == at::ScalarType::Half) {
@@ -579,4 +579,4 @@ void scaled_fp4_experts_quant_sm100a(
  } else {
    TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
  }
}

View File

@@ -347,7 +347,7 @@ void scaled_fp4_quant_sm100a(torch::Tensor const& output,
  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+ at::cuda::CUDAGuard device_guard{(char)input.get_device()};
  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());

  // We don't support e8m0 scales at this moment.

View File

@@ -267,7 +267,7 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
              B_sf.sizes()[1], ")");

  auto out_dtype = D.dtype();
- const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+ at::cuda::CUDAGuard device_guard{(char)A.get_device()};
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());

  if (out_dtype == at::ScalarType::Half) {

View File

@ -1,338 +0,0 @@
#pragma once
#include <cstdint>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
#define __quickreduce_device_inline__ __device__ __forceinline__
#define __quickreduce_launch_bounds_two_shot__ __launch_bounds__(256, 4)
#define __quickreduce_launch_bounds_one_shot__ __launch_bounds__(512, 4)
namespace quickreduce {
typedef __hip_bfloat16 nv_bfloat16;
typedef __hip_bfloat162 nv_bfloat162;
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
// Setup acquire-release semantics for vector memory reads (mubuf instruction)
// as per architecture.
#if defined(__gfx942__)
// CDNA3: Scope bits sc0, sc1
#define MUBUF_ACQUIRE 16
#define MUBUF_RELEASE 16
#elif (defined(__gfx908__) || defined(__gfx90a__))
// CDNA1 and CDNA2 - glc bit
#define MUBUF_ACQUIRE 1
#define MUBUF_RELEASE 0
#endif
static constexpr int kNegOne = 0xBC00BC00; // {-1, -1}, fp16x2_t
// Number of atoms (4xf16x2_t) processed by a single thread
static constexpr int kAtoms = 8;
// We use a workgroup of 256 threads
static constexpr int kBlockSize = 256;
static constexpr int kAtomStride = kBlockSize;
// Size and atom stride of source/destination data that the block will
// process.
// Workgroup scope = Tile = (256 threads x 8 atoms x 16B)
static constexpr int kTileSize = kBlockSize * kAtoms * sizeof(int32x4_t);
// Max number of blocks. 304 CUs on MI300
static constexpr int kMaxNumBlocks = 304 * 4;
// Standard CDNA wavefront size.
static constexpr int kWavefront = 64;
// 256 thread, 4 wavefronts.
static dim3 constexpr kBlockTwoShot = {kWavefront, kBlockSize / kWavefront, 1};
// Number of threads in a group for quantization
// It corresponds to 32 F16 elements in quantization block
static constexpr int kThreadGroupSize = 8;
// Methods
__quickreduce_device_inline__ __host__ unsigned long divceil(unsigned long x,
unsigned long y) {
return ((x + y - 1) / y);
}
union BufferResource {
__quickreduce_device_inline__ constexpr BufferResource()
: config(0x00020000U) {}
__quickreduce_device_inline__ constexpr BufferResource(void* buffer_address,
uint32_t buffer_size)
: address(buffer_address), range(buffer_size), config(0x00020000U) {}
int32x4_t descriptor;
struct {
void* address; // 8B, out of which first 48b is address, and 16b is stride
// (unused)
uint32_t range; // Byte range for the buffer resource
uint32_t config; // Constant, DFMT=32b
};
};
__quickreduce_device_inline__ static int32x4_t buffer_load_dwordx4(
int32x4_t srsrc, int32_t voffset, int32_t soffset,
int32_t aux) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
__quickreduce_device_inline__ static void buffer_store_dwordx4(
int32x4_t data, int32x4_t srsrc, int32_t voffset, int32_t soffset,
int32_t aux) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
__quickreduce_device_inline__ static void set_fp16_ovfl(bool const value) {
#if defined(__gfx942__)
if (value) {
asm volatile("s_setreg_imm32_b32 0xdc1, 1;" ::);
} else {
asm volatile("s_setreg_imm32_b32 0xdc1, 0;" ::);
}
#endif
}
union bf162_int_union {
int i;
nv_bfloat162 bf2;
};
template <typename T>
__quickreduce_device_inline__ void packed_assign_add(int32x4_t* A,
int32x4_t* B);
template <>
__quickreduce_device_inline__ void packed_assign_add<half>(int32x4_t* A,
int32x4_t* B) {
int32x4_t& tR_fragment = A[0];
int32x4_t& tA_fragment = B[0];
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[0])
: "v"(tR_fragment[0]), "v"(tA_fragment[0]));
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[1])
: "v"(tR_fragment[1]), "v"(tA_fragment[1]));
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[2])
: "v"(tR_fragment[2]), "v"(tA_fragment[2]));
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[3])
: "v"(tR_fragment[3]), "v"(tA_fragment[3]));
}
template <>
__quickreduce_device_inline__ void packed_assign_add<nv_bfloat16>(
int32x4_t* A, int32x4_t* B) {
nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(A);
nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(B);
#pragma unroll
for (int i = 0; i < 4; i++) {
tA[i] = __hadd2(tA[i], tB[i]);
}
}
template <typename T>
__quickreduce_device_inline__ int packed_max(int a, int b);
template <>
__quickreduce_device_inline__ int packed_max<half>(int a, int b) {
int result;
asm volatile("v_pk_max_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
template <>
__quickreduce_device_inline__ int packed_max<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hmax2(A.bf2, B.bf2);
return R.i;
}
template <typename T>
__quickreduce_device_inline__ int packed_min(int a, int b);
template <>
__quickreduce_device_inline__ int packed_min<half>(int a, int b) {
int result;
asm volatile("v_pk_min_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
template <>
__quickreduce_device_inline__ int packed_min<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hmin2(A.bf2, B.bf2);
return R.i;
}
template <typename T>
__quickreduce_device_inline__ int packed_abs_max(int a, int b);
template <>
__quickreduce_device_inline__ int packed_abs_max<half>(int a, int b) {
half2 wmaxh2 = __builtin_bit_cast(half2, a);
half2 wminh2 = __builtin_bit_cast(half2, b);
half2 wblockmaxh2;
wblockmaxh2.x =
__hgt(__habs(wmaxh2.x), __habs(wminh2.x)) ? wmaxh2.x : wminh2.x;
wblockmaxh2.y =
__hgt(__habs(wmaxh2.y), __habs(wminh2.y)) ? wmaxh2.y : wminh2.y;
return __builtin_bit_cast(int, wblockmaxh2);
}
template <>
__quickreduce_device_inline__ int packed_abs_max<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2.x = __hgt(__habs(A.bf2.x), __habs(B.bf2.x)) ? A.bf2.x : B.bf2.x;
R.bf2.y = __hgt(__habs(A.bf2.y), __habs(B.bf2.y)) ? A.bf2.y : B.bf2.y;
return R.i;
}
template <typename T>
__quickreduce_device_inline__ int packed_add(int a, int b);
template <>
__quickreduce_device_inline__ int packed_add<half>(int a, int b) {
int result;
asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
template <>
__quickreduce_device_inline__ int packed_add<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hadd2(A.bf2, B.bf2);
return R.i;
}
template <>
__quickreduce_device_inline__ int packed_add<int16_t>(int a, int b) {
int result;
asm volatile("v_pk_add_i16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
template <typename T>
__quickreduce_device_inline__ int packed_sub(int a, int b);
template <>
__quickreduce_device_inline__ int packed_sub<half>(int a, int b) {
int result;
// MI300 lacks packed fp16 sub instruction. So we do -1 * min + max
asm volatile("v_pk_fma_f16 %0, %1, %2 %3"
: "=v"(result)
: "v"(kNegOne), "v"(b), "v"(a));
return result;
}
template <>
__quickreduce_device_inline__ int packed_sub<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hsub2(A.bf2, B.bf2);
return R.i;
}
template <typename T>
__quickreduce_device_inline__ int packed_mul(int a, int b);
template <>
__quickreduce_device_inline__ int packed_mul<half>(int a, int b) {
int result;
asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
template <>
__quickreduce_device_inline__ int packed_mul<nv_bfloat16>(int a, int b) {
nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(&a);
nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(&b);
nv_bfloat162 tR = __hmul2(*tA, *tB);
return *(reinterpret_cast<int*>(&tR));
}
template <typename T>
__quickreduce_device_inline__ int packed_rcp(int a);
template <>
__quickreduce_device_inline__ int packed_rcp<half>(int a) {
return __builtin_bit_cast(int, h2rcp(__builtin_bit_cast(half2, a)));
}
template <>
__quickreduce_device_inline__ int packed_rcp<nv_bfloat16>(int a) {
bf162_int_union A, R;
A.i = a;
R.bf2 = h2rcp(A.bf2);
return R.i;
}
// changes dtype
__quickreduce_device_inline__ float T2float_cast(half a) {
return __half2float(a);
}
__quickreduce_device_inline__ float T2float_cast(nv_bfloat16 a) {
return __bfloat162float(a);
}
template <typename T>
__quickreduce_device_inline__ int group_abs_max(int32x4_t atom) {
const int group_leader = (threadIdx.x / kThreadGroupSize) * kThreadGroupSize;
int wmax, wmin, wblockmax;
int a, b;
a = packed_max<T>(atom[0], atom[1]);
b = packed_max<T>(atom[2], atom[3]);
wmax = packed_max<T>(a, b);
a = packed_min<T>(atom[0], atom[1]);
b = packed_min<T>(atom[2], atom[3]);
wmin = packed_min<T>(a, b);
// Reduce the max among a group of threads
// Note: This is basically 2 blocks of values setup as the
// upper/lower halves of the f16x2_t
for (int i = 1; i < kThreadGroupSize; i <<= 1) {
int x = __shfl_down(wmax, i);
wmax = packed_max<T>(wmax, x);
int y = __shfl_down(wmin, i);
wmin = packed_min<T>(wmin, y);
}
wblockmax = packed_abs_max<T>(wmax, wmin);
// Share with the cohort
wblockmax = __shfl(wblockmax, group_leader);
return wblockmax;
}
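
A plain-Python sketch of the shuffle reduction above, for a single group of `kThreadGroupSize` lanes (simplified to a max over scalars; the kernel tracks packed max/min halves in lock-step rather than iterating):

```python
def group_max_sketch(vals, group_size=8):
    # One round per power of two, as in the __shfl_down loop: lane L folds
    # in lane L+i, and after log2(group_size) rounds lane 0 (the group
    # leader) holds the group maximum, which is then broadcast back
    # (the final __shfl from group_leader).
    vals = list(vals)
    i = 1
    while i < group_size:
        for lane in range(group_size - i):
            vals[lane] = max(vals[lane], vals[lane + i])
        i <<= 1
    return [vals[0]] * group_size

assert group_max_sketch([3, 1, 4, 1, 5, 9, 2, 6]) == [9] * 8
```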
__quickreduce_device_inline__ void set_sync_flag(uint32_t* flag_ptr,
uint32_t flag) {
__atomic_store_n(flag_ptr, flag, __ATOMIC_RELEASE);
}
__quickreduce_device_inline__ void wait_sync_flag(uint32_t* flag_ptr,
uint32_t flag) {
while (__atomic_load_n(flag_ptr, __ATOMIC_RELAXED) != flag) {
}
}
} // namespace quickreduce

View File

@@ -1,196 +0,0 @@
#pragma once
#include <vector>
#include <hip/hip_runtime.h>
#include "quick_reduce_impl.cuh"
#define HIP_CHECK(err) \
do { \
hipError_t err_ = (err); \
if (err_ != hipSuccess) { \
std::printf("HIP error %d at %s:%d. %s\n", err_, __FILE__, __LINE__, \
hipGetErrorString(err_)); \
throw std::runtime_error("HIP error"); \
} \
} while (0)
namespace quickreduce {
using fptr_t = int64_t;
static_assert(sizeof(void*) == sizeof(fptr_t));
template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
int rank, uint8_t** dbuffer_list,
uint32_t data_offset, uint32_t flag_color) {
int block = blockIdx.x;
int grid = gridDim.x;
while (block < num_blocks) {
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
flag_color);
block += grid;
flag_color++;
}
}
#define TWOSHOT_DISPATCH(__codec) \
if (world_size == 2) { \
using LineCodec = __codec<T, 2>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \
flag_color); \
} else if (world_size == 4) { \
using LineCodec = __codec<T, 4>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \
flag_color); \
} else if (world_size == 8) { \
using LineCodec = __codec<T, 8>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \
flag_color); \
}
enum QuickReduceQuantLevel {
F16 = 0,
INT8 = 1,
INT6 = 2,
INT4 = 3,
};
struct DeviceComms {
// Max problem size is 2GB (in bytes) or half of uint32_t max value.
int64_t kMaxProblemSize =
static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
// Max TP-8
static int constexpr kMaxWorldSize = 8;
bool initialized = false;
uint32_t flag_color = 1;
int world_size;
int rank;
uint8_t* dbuffer;
uint8_t** dbuffer_list;
hipIpcMemHandle_t buffer_ipc_handle;
std::vector<hipIpcMemHandle_t> all_buffer_ipc_handles;
std::vector<uint8_t*> buffer_list;
uint32_t data_offset;
DeviceComms() : initialized(false), world_size(1), rank(0) {}
~DeviceComms() { destroy(); }
void init(int world_size, int rank,
std::optional<int64_t> max_problem_size = std::nullopt) {
destroy();
this->world_size = world_size;
this->rank = rank;
if (max_problem_size.has_value() && max_problem_size.value() > 0) {
this->kMaxProblemSize = max_problem_size.value();
}
// Allocate buffer size for worst case: F16 2-stage buffer.
uint32_t flags_buffer_size =
2 * world_size * kMaxNumBlocks * sizeof(uint32_t);
static int64_t data_buffer_size = 2 * this->kMaxProblemSize;
int64_t total_buffer_size = flags_buffer_size + data_buffer_size;
data_offset = flags_buffer_size;
HIP_CHECK(hipExtMallocWithFlags((void**)&dbuffer, total_buffer_size,
hipDeviceMallocUncached));
// Clear the flags buffer.
HIP_CHECK(hipMemset(dbuffer, 0, flags_buffer_size));
// Device-side list of IPC buffers.
buffer_list.resize(world_size);
HIP_CHECK(hipMalloc(&dbuffer_list, world_size * sizeof(uint8_t*)));
// Create IPC handles for rank's communication buffer.
all_buffer_ipc_handles.resize(world_size);
HIP_CHECK(hipIpcGetMemHandle(&buffer_ipc_handle, dbuffer));
initialized = true;
}
int get_world_size() { return world_size; }
int get_rank() { return rank; }
bool status() { return initialized; }
hipIpcMemHandle_t const get_handle() { return buffer_ipc_handle; }
void destroy() {
if (initialized) {
for (int i = 0; i < world_size; i++) {
if (i != rank) {
HIP_CHECK(hipIpcCloseMemHandle(dbuffer_list[i]));
}
}
HIP_CHECK(hipFree(dbuffer));
HIP_CHECK(hipFree(dbuffer_list));
initialized = false;
}
}
void open_ipc_handles(std::vector<hipIpcMemHandle_t> const& ipc_handles) {
assert(ipc_handles.size() == all_buffer_ipc_handles.size());
for (int i = 0; i < world_size; i++) {
all_buffer_ipc_handles[i] = ipc_handles[i];
}
// Open device memory access to the IPC communication buffers.
// Note: For our own rank, we do not need to open a handle.
for (int i = 0; i < world_size; i++) {
if (i != rank) {
HIP_CHECK(hipIpcOpenMemHandle((void**)&buffer_list[i],
all_buffer_ipc_handles[i],
hipIpcMemLazyEnablePeerAccess));
} else {
buffer_list[i] = dbuffer;
}
}
HIP_CHECK(hipMemcpy(dbuffer_list, buffer_list.data(),
world_size * sizeof(uint8_t*), hipMemcpyHostToDevice));
}
template <typename T, bool cast_bf2half>
void allreduce(T const* A, T* B, uint32_t N, int quant_level,
hipStream_t stream) {
if (world_size != 2 && world_size != 4 && world_size != 8) {
throw std::runtime_error("All Reduce not supported for world_size = " +
std::to_string(world_size));
}
// Configuration.
uint32_t msg_size = N * sizeof(T);
uint32_t num_blocks = divceil(msg_size, kTileSize);
uint32_t grid = min(kMaxNumBlocks, num_blocks);
auto quant_level_ = static_cast<QuickReduceQuantLevel>(quant_level);
switch (quant_level_) {
case QuickReduceQuantLevel::INT8:
TWOSHOT_DISPATCH(CodecQ8)
break;
case QuickReduceQuantLevel::INT6:
TWOSHOT_DISPATCH(CodecQ6)
break;
case QuickReduceQuantLevel::INT4:
TWOSHOT_DISPATCH(CodecQ4)
break;
default:
TWOSHOT_DISPATCH(CodecFP)
break;
}
HIP_CHECK(cudaGetLastError());
// Rotate the flag color.
flag_color += divceil(N, grid);
}
};
} // namespace quickreduce

View File

@@ -1,698 +0,0 @@
#pragma once
#include <hip/hip_runtime.h>
#include "base.h"
namespace quickreduce {
struct CodecBase {
const int thread;
const int rank;
const int group_leader;
__quickreduce_device_inline__ CodecBase(int thread, int rank)
: thread(thread),
rank(rank),
group_leader((threadIdx.x / kThreadGroupSize) * kThreadGroupSize) {
set_fp16_ovfl(true);
}
};
// Default full precision codec.
template <typename T, int world_size>
struct CodecFP : public CodecBase {
static constexpr int kWorldSize = world_size;
static constexpr int kRankAtoms = kAtoms / kWorldSize;
// Codec tile size process by this workgroup.
// Each thread processes atoms of f16x8_t (16B).
static constexpr int kRankTransmittedTileSize =
kBlockSize * kRankAtoms * sizeof(int32x4_t);
static_assert(kRankTransmittedTileSize % 16 == 0,
"kRankTransmittedTileSize must be 16B aligned.");
// Total tile size for the collective communication.
static constexpr int kTransmittedTileSize =
kRankTransmittedTileSize * kWorldSize;
__quickreduce_device_inline__ CodecFP(int thread, int rank)
: CodecBase(thread, rank) {}
__quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
const int32x4_t* __restrict__ data) {
for (int i = 0; i < kRankAtoms; i++) {
__builtin_nontemporal_store(data[i], send_buffer + thread);
send_buffer += kAtomStride;
}
}
__quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
int32x4_t* __restrict__ data) {
for (int i = 0; i < kRankAtoms; i++) {
data[i] = __builtin_nontemporal_load(*recv_buffer + thread);
*recv_buffer += kAtomStride;
}
}
};
// Int4 symmetric quantization codec.
// We quantize the FP16 data to block-scaled Int4 in blocks of 4 *
// kThreadGroupSize.
template <typename T, int world_size>
struct CodecQ4 : public CodecBase {
static constexpr int kWorldSize = world_size;
// Codec tile size process by this workgroup.
// Each threads processes a fragment of fp16x8_t (16B),
// into a int4x8_t (4B) and a fp16 scale shared among 32 values.
static constexpr int kRankAtoms = kAtoms / kWorldSize;
static constexpr int kRankTileStride = 1152;
static constexpr int kRankTileScaleOffset = 1024;
static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
static_assert(kRankTransmittedTileSize % 16 == 0,
"kRankTransmittedTileSize must be 16B aligned.");
static constexpr int kRankBufferTileStride =
kRankTileStride / sizeof(int32x4_t);
// Total tile size for the collective communication.
static constexpr int kTransmittedTileSize =
kRankTransmittedTileSize * kWorldSize;
// Constants configuration
// {-1/8.0h, -1/8.0h}, f16x2_t
static constexpr int kScaleFactor =
std::is_same<T, half>::value ? 0xB000B000 : 0xBE00BE00;
// {1e-7, 1e-7}, f16x2_t
static constexpr int kScaleEpsilon =
std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
// {-8, -8}, f16x2_t
static constexpr int kRangeMin =
std::is_same<T, half>::value ? 0xC800C800 : 0xC100C100;
// {+7, +7}, f16x2_t
static constexpr int kRangeMax =
std::is_same<T, half>::value ? 0x47004700 : 0x40E040E0;
// {+8, +8}, int16x2_t
static constexpr int kRangeBias = 0x00080008;
__quickreduce_device_inline__ CodecQ4(int thread, int rank)
: CodecBase(thread, rank) {}
__quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
const int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
int32x4_t const atom = data[k];
// Compute the absolute maximum of the atom in the thread group
// In 2 blocks of values, upper/lower halves of the f16x2_t
int wblockmax = group_abs_max<T>(atom);
// Derive scales
int decoding_scale;
int encoding_scale;
decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
encoding_scale = packed_rcp<T>(encoding_scale);
// Apply scales to get quantized values
int32x4_t w;
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(atom[i], encoding_scale);
w[i] = packed_max<T>(w[i], kRangeMin);
w[i] = packed_min<T>(w[i], kRangeMax);
}
// Convert from f16x2_t to uint16x2_t
int32x4_t q;
{
int16_t* qi = reinterpret_cast<int16_t*>(&q);
T* wh = reinterpret_cast<T*>(&w);
for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
for (int i = 0; i < 4; i++) {
q[i] = packed_add<int16_t>(q[i], kRangeBias);
}
}
// Pack 8 x q4 into int32_t
int qw = q[0] | (q[1] << 4) | (q[2] << 8) | (q[3] << 12);
// Write quantized atom to send_buffer
// note: only the group leader stores the scale
uint8_t* atom_ptr =
reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
__builtin_nontemporal_store(qw, qw_ptr);
if (threadIdx.x == group_leader) {
__builtin_nontemporal_store(decoding_scale, qs_ptr);
}
}
}
__quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
// Directly read quantized atom from recv_buffer
uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
int32_t qw = __builtin_nontemporal_load(qw_ptr);
int qs = __builtin_nontemporal_load(qs_ptr);
*recv_buffer += kRankBufferTileStride;
// Unpack q4 into f16x8_t
int32x4_t w;
{
static constexpr uint kMask000F = 0x000F000F;
static constexpr uint kHalf2_1024 =
0x64006400; // {1024.0, 1024.0}, fp16x2_t
static uint constexpr kHalf2_1032 =
0xE408E408; // {-1032.0, -1032.0}, fp16x2_t
for (int i = 0; i < 4; i++) {
if constexpr (std::is_same<T, half>::value) {
int32_t q4 = ((qw >> (i * 4)) & kMask000F) | kHalf2_1024;
w[i] = packed_add<half>(q4, kHalf2_1032);
} else {
int32_t int16_2 = (qw >> (i * 4)) & kMask000F;
int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
}
}
}
// Apply decoding scales
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(w[i], qs);
}
data[k] = w;
}
}
};
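
A torch sketch of CodecQ4's numeric scheme (block-scaled symmetric int4 over 32-value blocks, i.e. 4 * kThreadGroupSize; it ignores the packed-register layout and the negative-scale trick behind kScaleFactor):

```python
import torch

def q4_block_quant_sketch(x: torch.Tensor, block: int = 32):
    # One scale per block of 32 values: scale = absmax / 8 plus a small
    # epsilon (kScaleEpsilon); values are clamped to [-8, 7]
    # (kRangeMin/kRangeMax) and biased by +8 (kRangeBias). The kernel
    # additionally packs 8 nibbles per int32; here it is one value per byte.
    xb = x.float().reshape(-1, block)
    absmax = xb.abs().amax(dim=1, keepdim=True)
    scale = absmax / 8.0 + 1e-7
    q = torch.clamp(torch.round(xb / scale), -8, 7)
    return (q + 8).to(torch.uint8), scale

def q4_block_dequant_sketch(q: torch.Tensor, scale: torch.Tensor):
    return ((q.float() - 8.0) * scale).reshape(-1)
```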
// Int6 symmetric quantization codec.
// We quantize the FP16 data to block-scaled Int6 in blocks of 4 *
// kThreadGroupSize.
template <typename T, int world_size>
struct CodecQ6 : public CodecBase {
static constexpr int kWorldSize = world_size;
// Codec tile size process by this workgroup.
// Each threads processes a fragment of fp16x8_t (16B),
// into a int6x8_t (4B + 2B) and a fp16 scale shared among 32 values.
static constexpr int kRankAtoms = kAtoms / kWorldSize;
static constexpr int kRankTileStride = 1664;
static constexpr int kRankTileQ2Offset = 1024;
static constexpr int kRankTileScaleOffset = 1536;
static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
static_assert(kRankTransmittedTileSize % 16 == 0,
"kRankTransmittedTileSize must be 16B aligned.");
static constexpr int kRankBufferTileStride =
kRankTileStride / sizeof(int32x4_t);
// Total tile size for the collective communication.
static constexpr int kTransmittedTileSize =
kRankTransmittedTileSize * kWorldSize;
// Constants configuration
// {-1/32.0h, -1/32.0h}, fp16x2_t
static constexpr int kScaleFactor =
std::is_same<T, half>::value ? 0xA800A800 : 0xBD00BD00;
// {1e-7, 1e-7}, fp16x2_t
static constexpr int kScaleEpsilon =
std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
// {-32, -32}, fp16x2_t
static constexpr int kRangeMin =
std::is_same<T, half>::value ? 0xD000D000 : 0xC200C200;
// {+31, +31}, fp16x2_t
static constexpr int kRangeMax =
std::is_same<T, half>::value ? 0x4FC04FC0 : 0x41F841F8;
// {+32, +32}, int16x2_t
static constexpr int kRangeBias = 0x00200020;
__quickreduce_device_inline__ CodecQ6(int thread, int rank)
: CodecBase(thread, rank) {}
__quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
const int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
int32x4_t const atom = data[k];
// Compute the absolute maximum of the atom in the thread group
// In 2 blocks of values, upper/lower halves of the f16x2_t
int wblockmax = group_abs_max<T>(atom);
// Derive scales
int decoding_scale;
int encoding_scale;
decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
encoding_scale = packed_rcp<T>(encoding_scale);
// Apply scales to get quantized values
int32x4_t w;
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(atom[i], encoding_scale);
w[i] = packed_max<T>(w[i], kRangeMin);
w[i] = packed_min<T>(w[i], kRangeMax);
}
// Convert from f16x2_t to uint16x2_t
int32x4_t q;
{
int16_t* qi = reinterpret_cast<int16_t*>(&q);
T* wh = reinterpret_cast<T*>(&w);
for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
for (int i = 0; i < 4; i++) {
q[i] = packed_add<int16_t>(q[i], kRangeBias);
}
}
// Pack 8 x q6 into int32_t + int16_t
uint32_t q4w;
uint16_t q2w = 0;
q4w = (q[0] & 0x000F000F) | ((q[1] & 0x000F000F) << 4) |
((q[2] & 0x000F000F) << 8) | ((q[3] & 0x000F000F) << 12);
{
int16_t* tw = reinterpret_cast<int16_t*>(&q);
#pragma unroll
for (int i = 0; i < 8; i++) {
q2w |= (tw[i] >> 4) << (i * 2);
}
}
// Write quantized atom to send_buffer
// note: only the group leader stores the scale
uint8_t* atom_ptr =
reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
uint16_t* q2w_ptr =
reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
__builtin_nontemporal_store(q4w, q4w_ptr);
__builtin_nontemporal_store(q2w, q2w_ptr);
if (threadIdx.x == group_leader) {
__builtin_nontemporal_store(decoding_scale, qs_ptr);
}
}
}
__quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
// Directly read quantized atom from recv_buffer
uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
uint16_t* q2w_ptr =
reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
uint32_t q4w = __builtin_nontemporal_load(q4w_ptr);
uint16_t q2w = __builtin_nontemporal_load(q2w_ptr);
int qs = __builtin_nontemporal_load(qs_ptr);
*recv_buffer += kRankBufferTileStride;
// Unpack q6 into fp16x8_t
int32x4_t w;
{
static uint constexpr kMask000F = 0x000F000F;
static uint constexpr kHalf2_1024 =
0x64006400; // {1024.0, 1024.0}, fp16x2_t
static uint constexpr kHalf2_1056 =
0xE420E420; // {-1056.0, -1056.0}, fp16x2_t
#pragma unroll
for (int i = 0; i < 4; i++) {
int32_t q4 = q4w & kMask000F;
int32_t q2 = (q2w & 0x3) | ((q2w & 0xC) << 14);
q4w >>= 4;
q2w >>= 4;
if constexpr (std::is_same<T, half>::value) {
int32_t q6 = q4 | (q2 << 4) | kHalf2_1024;
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(w[i])
: "v"(q6), "v"(kHalf2_1056));
} else {
int32_t int16_2 = q4 | (q2 << 4);
int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
}
}
}
// Apply decoding scales
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(w[i], qs);
}
// Store the dequantized atom.
data[k] = w;
}
}
};
// Int8 symmetric quantization codec.
// We quantize the FP16 data to block-scaled Int8 in blocks of 4 *
// kThreadGroupSize.
template <typename T, int world_size>
struct CodecQ8 : public CodecBase {
static constexpr int kWorldSize = world_size;
// Codec tile size processed by this workgroup.
// Each thread processes a fragment of f16x8_t (16B)
// into an int8x8_t (8B) and an f16 scale shared among 32 values.
static constexpr int kRankAtoms = kAtoms / kWorldSize;
static constexpr int kRankTileStride = 2176;
static constexpr int kRankTileScaleOffset = 2048;
static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
static_assert(kRankTransmittedTileSize % 16 == 0,
              "kRankTransmittedTileSize must be 16B aligned.");
static constexpr int kRankBufferTileStride =
kRankTileStride / sizeof(int32x4_t);
// Total tile size for the collective communication.
static constexpr int kTransmittedTileSize =
kRankTransmittedTileSize * kWorldSize;
// Constants configuration
// {-1/128.0h, -1/128.0h}, f16x2_t
static constexpr int kScaleFactor =
std::is_same<T, half>::value ? 0xA000A000 : 0xBC00BC00;
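// note: kScaleFactor is negative, so both scales carry a sign flip; this
// cancels out because the scale is applied once at encode and once at decode.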
// {1e-7, 1e-7}, f16x2_t
static constexpr int kScaleEpsilon =
std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
// {-128, -128}, f16x2_t
static constexpr int kRangeMin =
std::is_same<T, half>::value ? 0xD800D800 : 0xC300C300;
// {+127, +127}, f16x2_t
static constexpr int kRangeMax =
std::is_same<T, half>::value ? 0x57F057F0 : 0x42FE42FE;
// {+128, +128}, int16x2_t
static constexpr int kRangeBias = 0x00800080;
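// Worked example (fp16): blockmax = 4.0 gives decoding_scale = -1/32 and
// encoding_scale ~= -32. A value of 2.0 encodes to rint(2.0 * -32) = -64,
// biased to -64 + 128 = 64; decoding computes (64 - 128) * (-1/32) = 2.0.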
__quickreduce_device_inline__ CodecQ8(int thread, int rank)
: CodecBase(thread, rank) {}
__quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
int32x4_t const* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
int32x4_t const atom = data[k];
// Compute the absolute maximum of the atom across the thread group,
// tracked separately for the lower and upper halves of each f16x2_t
// (i.e. two blocks of values)
int wblockmax = group_abs_max<T>(atom);
// Derive scales
int decoding_scale;
int encoding_scale;
decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
encoding_scale = packed_rcp<T>(encoding_scale);
// Apply scales to get quantized values
int32x4_t w;
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(atom[i], encoding_scale);
w[i] = packed_max<T>(w[i], kRangeMin);
w[i] = packed_min<T>(w[i], kRangeMax);
}
// Convert from f16x2_t to uint16x2_t
int32x4_t q;
{
int16_t* qi = reinterpret_cast<int16_t*>(&q);
T* wh = reinterpret_cast<T*>(&w);
for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
for (int i = 0; i < 4; i++) {
q[i] = packed_add<int16_t>(q[i], kRangeBias);
}
}
// Pack 8 x q8 into int32x2_t
int32x2_t qw;
qw[0] = q[0] | (q[1] << 8);
qw[1] = q[2] | (q[3] << 8);
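// Byte layout of qw[0]: bits [0:8] = q0 lane 0, [8:16] = q1 lane 0,
// [16:24] = q0 lane 1, [24:32] = q1 lane 1; qw[1] packs q2/q3 the same way.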
// Write quantized atom to send_buffer
// note: only the group leader stores the scale
uint8_t* atom_ptr =
reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
__builtin_nontemporal_store(qw, qw_ptr);
if (threadIdx.x == group_leader) {
__builtin_nontemporal_store(decoding_scale, qs_ptr);
}
}
}
__quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
// Directly read quantized atom from recv_buffer
uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
int32x2_t qw = __builtin_nontemporal_load(qw_ptr);
int qs = __builtin_nontemporal_load(qs_ptr);
*recv_buffer += kRankBufferTileStride;
// Unpack q8 into fp16x8_t
int32x4_t w;
{
static uint constexpr kMask00FF = 0x00FF00FF;
// {1024.0, 1024.0}, fp16x2_t
static uint constexpr kHalf2_1024 = 0x64006400;
// {-1152.0, -1152.0}, fp16x2_t
static uint constexpr kHalf2_1152 = 0xE480E480;
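// Same mantissa trick as the 6-bit codec: (q8 | 0x6400) is the fp16 value
// (1024 + q8); adding -1152 = -(1024 + 128) removes the magic constant and
// the +128 range bias in one packed add.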
#pragma unroll
for (int i = 0; i < 4; i++) {
if constexpr (std::is_same<T, half>::value) {
int32_t q8 =
((qw[i / 2] >> ((i % 2) * 8)) & kMask00FF) | kHalf2_1024;
w[i] = packed_add<half>(q8, kHalf2_1152);
} else {
int32_t int16_2 = (qw[i / 2] >> ((i % 2) * 8)) & kMask00FF;
int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
}
}
}
// Apply decoding scales
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(w[i], qs);
}
data[k] = w;
}
}
};
// Twoshot All Reduce
template <typename T, class Codec, bool cast_bf2half>
struct AllReduceTwoshot {
static_assert(sizeof(T) == 2);
static constexpr int kWorldSize = Codec::kWorldSize;
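// Algorithm: each rank quantizes its tile and scatters segment r to rank r
// (phase 1A), reduces the segment it owns (phase 1B), then broadcasts the
// reduced segment to all ranks and gathers the full tile (phase 2). Sync
// flags are tagged with flag_color so the flags of this invocation are
// distinguishable from those left by a previous call.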
__device__ static void run(
T const* __restrict__ input, T* __restrict__ output,
uint32_t const N, // number of elements
int const block, // block index
int const rank, // rank index
uint8_t** __restrict__ buffer_list, // communication buffers
uint32_t const data_offset, // offset to start of the data buffer
uint32_t flag_color) {
// Topology
int thread = threadIdx.x + threadIdx.y * kWavefront;
uint8_t* rank_buffer = buffer_list[rank];
Codec codec(thread, rank);
int block_id = blockIdx.x;
int grid_size = gridDim.x;
// --------------------------------------------------------
// Read input into registers
int32x4_t tA[kAtoms];
BufferResource src_buffer(const_cast<T*>(input), N * sizeof(T));
uint32_t src_offset = block * kTileSize + thread * sizeof(int32x4_t);
for (int i = 0; i < kAtoms; i++) {
tA[i] = buffer_load_dwordx4(src_buffer.descriptor, src_offset, 0, 0);
src_offset += kAtomStride * sizeof(int32x4_t);
if constexpr (cast_bf2half) {
const nv_bfloat162* bf_buf =
reinterpret_cast<const nv_bfloat162*>(&tA[i]);
half2 half_buf[4];
#pragma unroll
for (int j = 0; j < 4; ++j) {
float2 f = __bfloat1622float2(bf_buf[j]);
half_buf[j] = __float22half2_rn(f);
}
tA[i] = *reinterpret_cast<const int32x4_t*>(half_buf);
}
}
// --------------------------------------------------------
// Phase-1A: Write segment data into the communication buffer of the target
// rank responsible for this segment.
uint32_t comm_data0_offset =
data_offset + block_id * Codec::kTransmittedTileSize;
uint32_t comm_data1_offset =
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
uint32_t comm_flags1_offset =
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
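// Buffer layout per block: the first grid_size tiles (data0/flags0) carry
// the phase-1 scatter, the next grid_size tiles (data1/flags1) carry the
// phase-2 broadcast.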
for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer =
reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data0_offset +
rank * Codec::kRankTransmittedTileSize);
codec.send(send_buffer, &tA[r * Codec::kRankAtoms]);
}
__syncthreads();
if (thread < kWorldSize) {
int r = thread;
uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(
buffer_list[r] + comm_flags0_offset + rank * sizeof(uint32_t));
set_sync_flag(flag_ptr, flag_color);
}
// --------------------------------------------------------
// Phase-1B: Reduce the segment data from the communication buffers.
int32x4_t tR[Codec::kRankAtoms] = {};
{
// Read the data from the communication buffer.
int32x4_t* recv_buffer =
reinterpret_cast<int32x4_t*>(rank_buffer + comm_data0_offset);
uint32_t* flag_ptr =
reinterpret_cast<uint32_t*>(rank_buffer + comm_flags0_offset);
for (int r = 0; r < kWorldSize; r++) {
// Wait for the flags to be set.
if (thread == 0) {
wait_sync_flag(&flag_ptr[r], flag_color);
}
__syncthreads();
// note: we reuse tA as temp buffer here
codec.recv(&recv_buffer, tA);
for (int i = 0; i < Codec::kRankAtoms; i++) {
packed_assign_add<T>(&tR[i], &tA[i]);
}
}
}
// Phase-2: Write the reduced segment to every other rank
for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer =
reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data1_offset +
rank * Codec::kRankTransmittedTileSize);
codec.send(send_buffer, tR);
}
__syncthreads();
if (thread < kWorldSize) {
int r = thread;
uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(
buffer_list[r] + comm_flags1_offset + rank * sizeof(uint32_t));
set_sync_flag(flag_ptr, flag_color);
}
// Phase-2: Read the gathered segments from the rank's communication buffer.
{
// Read the data from the communication buffer.
int32x4_t* recv_buffer =
reinterpret_cast<int32x4_t*>(rank_buffer + comm_data1_offset);
uint32_t* flag_ptr =
reinterpret_cast<uint32_t*>(rank_buffer + comm_flags1_offset);
for (int r = 0; r < kWorldSize; r++) {
// Wait for the flags to be set.
if (thread == 0) {
wait_sync_flag(&flag_ptr[r], flag_color);
}
__syncthreads();
// Gather all reduced and final rank segments into tA.
codec.recv(&recv_buffer, &tA[r * Codec::kRankAtoms]);
}
}
// --------------------------------------------------------
// Write the result to output.
BufferResource dst_buffer(output, N * sizeof(T));
uint32_t dst_offset = block * kTileSize + thread * sizeof(int32x4_t);
for (int i = 0; i < kAtoms; i++) {
if constexpr (cast_bf2half) {
const half2* half_buf = reinterpret_cast<const half2*>(&tA[i]);
nv_bfloat162 bf16_buf[4];
#pragma unroll
for (int j = 0; j < 4; ++j) {
float2 f = __half22float2(half_buf[j]);
bf16_buf[j] = __float22bfloat162_rn(f);
}
buffer_store_dwordx4(*reinterpret_cast<const int32x4_t*>(bf16_buf),
dst_buffer.descriptor, dst_offset, 0, 0);
} else {
buffer_store_dwordx4(tA[i], dst_buffer.descriptor, dst_offset, 0, 0);
}
dst_offset += kAtomStride * sizeof(int32x4_t);
}
}
};
} // namespace quickreduce

View File

@ -1598,6 +1598,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int warpid = threadIdx.x / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE;
const int laneid = threadIdx.x % WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE;
const int lane2id = laneid % 2; const int lane2id = laneid % 2;
const int lane4id = laneid % 4;
const int lane16id = laneid % 16; const int lane16id = laneid % 16;
const int rowid = laneid / 16; const int rowid = laneid / 16;
@ -1744,6 +1745,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx = const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
@ -2366,6 +2368,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int warpid = threadIdx.x / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE;
const int laneid = threadIdx.x % WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE;
const int lane2id = laneid % 2; const int lane2id = laneid % 2;
const int lane4id = laneid % 4;
const int lane16id = laneid % 16; const int lane16id = laneid % 16;
const int rowid = laneid / 16; const int rowid = laneid / 16;
@ -2511,6 +2514,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx = const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

View File

@ -725,24 +725,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle); custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
custom_ar.def("free_shared_buffer", &free_shared_buffer); custom_ar.def("free_shared_buffer", &free_shared_buffer);
#ifdef USE_ROCM
// Quick Reduce all-reduce kernels
custom_ar.def(
"qr_all_reduce(int fa, Tensor inp, Tensor out, int quant_level, bool "
"cast_bf2half) -> ()");
custom_ar.impl("qr_all_reduce", torch::kCUDA, &qr_all_reduce);
custom_ar.def("init_custom_qr", &init_custom_qr);
custom_ar.def("qr_destroy", &qr_destroy);
custom_ar.def("qr_get_handle", &qr_get_handle);
custom_ar.def("qr_open_handles(int _fa, Tensor[](b!) handles) -> ()");
custom_ar.impl("qr_open_handles", torch::kCPU, &qr_open_handles);
// Max input size in bytes
custom_ar.def("qr_max_size", &qr_max_size);
#endif
} }
REGISTER_EXTENSION(TORCH_EXTENSION_NAME) REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@ -6,106 +6,30 @@
# docs/assets/contributing/dockerfile-stages-dependency.png # docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
# By parameterizing the base images, we allow third parties to use their own
# base images. One use case is hermetic builds with base images stored in
# private registries that use different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third parties to use
# their own mirror. When doing so, we don't benefit from the transparent
# installation of the GPG key of the PPA, as done by add-apt-repository, so we
# also need a URL for the GPG key.
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
# The PyPA get-pip.py script is a self-contained script+zip file that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environments where a distribution package does not exist.
#
# By parameterizing the URL for the get-pip.py installation script, we allow
# third parties to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
# PIP supports fetching the packages from custom indexes, allowing third parties
# to host the packages in private mirrors. The PIP_INDEX_URL and
# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
# default indexes. By leaving them empty by default, PIP will use its default
# indexes if the build process doesn't override the indexes.
#
# Uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
# PIP supports multiple authentication schemes, including keyring.
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third parties to use keyring authentication for
# their private Python indexes, while not changing the default behavior, which
# is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
# Install Python and other dependencies # Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \ && apt-get install -y ccache software-properties-common git curl sudo \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \ && for i in 1 2 3; do \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \ add-apt-repository -y ppa:deadsnakes/ppa && break || \
mkdir -p -m 0755 /etc/apt/keyrings ; \ { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \ done \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs # Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv python3 -m pip install uv
@ -139,25 +63,21 @@ WORKDIR /workspace
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system \ uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
"torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--pre pytorch_triton==3.3.0+gitab727c40; \
fi fi
COPY requirements/common.txt requirements/common.txt COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \ uv pip install --system -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2 # explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size # Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real' ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@ -168,10 +88,6 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
FROM base AS build FROM base AS build
ARG TARGETPLATFORM ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# install build dependencies # install build dependencies
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
@ -182,7 +98,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
@ -197,8 +113,6 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0 ARG SCCACHE_S3_NO_CREDENTIALS=0
@ -207,11 +121,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \ && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \ && tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -249,10 +162,6 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
FROM base as dev FROM base as dev
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
@ -267,25 +176,21 @@ COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt \ uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed # TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM ${FINAL_BASE_IMAGE} AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@ -295,33 +200,17 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \ && for i in 1 2 3; do \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \ add-apt-repository -y ppa:deadsnakes/ppa && break || \
mkdir -p -m 0755 /etc/apt/keyrings ; \ { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \ done \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs # Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv python3 -m pip install uv
@ -343,23 +232,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system \ uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
"torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
uv pip install --system \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--pre pytorch_triton==3.3.0+gitab727c40 ; \
fi fi
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \ uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer # $ cd flashinfer
# $ git checkout v0.2.6.post1 # $ git checkout v0.2.6.post1
@ -369,20 +254,15 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Allow specifying a version, Git revision or local .whl file
ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.6.post1"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
# FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
if [[ "$CUDA_VERSION" == 12.8* ]]; then \ if [[ "$CUDA_VERSION" == 12.8* ]]; then \
uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL} ; \ uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \
else \ else \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' && \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \
git clone ${FLASHINFER_GIT_REPO} --single-branch --branch ${FLASHINFER_GIT_REF} --recursive && \ git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \
# Needed to build AOT kernels # Needed to build AOT kernels
(cd flashinfer && \ (cd flashinfer && \
python3 -m flashinfer.aot && \ python3 -m flashinfer.aot && \
@ -406,7 +286,7 @@ uv pip list
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
@ -417,11 +297,6 @@ FROM vllm-base AS test
ADD . /vllm-workspace/ ADD . /vllm-workspace/
ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
@ -432,7 +307,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \ if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \ uv pip install --system -r requirements/dev.txt; \
@ -448,7 +323,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ENV HF_HUB_ENABLE_HF_TRANSFER 1 ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet) # Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
# doc requires source code # doc requires source code
# we hide them inside `test_docs/` , so that this source code # we hide them inside `test_docs/` , so that this source code
@ -465,9 +340,6 @@ RUN mv mkdocs.yaml test_docs/
FROM vllm-base AS vllm-openai-base FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500

View File

@ -66,7 +66,7 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \ --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
uv pip install -r requirements/build.txt uv pip install -r requirements/build.txt
COPY . . COPY . .
@ -79,22 +79,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
######################### TEST DEPS #########################
FROM base AS vllm-test-deps
WORKDIR /workspace/vllm
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt
######################### DEV IMAGE ######################### ######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev FROM vllm-build AS vllm-dev
@ -113,19 +97,28 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop VLLM_TARGET_DEVICE=cpu python3 setup.py develop
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
uv pip install -r requirements/dev.txt && \ uv pip install -r requirements/dev.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"] ENTRYPOINT ["bash"]
######################### TEST IMAGE ######################### ######################### TEST IMAGE #########################
FROM vllm-test-deps AS vllm-test FROM base AS vllm-test
WORKDIR /workspace/ WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
uv pip install -r requirements/cpu-test.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl uv pip install dist/*.whl

View File

@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="6487649" ARG AITER_BRANCH="c1debd8"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base

View File

@ -35,7 +35,6 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
ENV VLLM_TARGET_DEVICE=xpu ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \

View File

@ -48,12 +48,7 @@ nav:
- General: - General:
- glob: contributing/* - glob: contributing/*
flatten_single_child_sections: true flatten_single_child_sections: true
- Model Implementation: - Model Implementation: contributing/model
- contributing/model/README.md
- contributing/model/basic.md
- contributing/model/registration.md
- contributing/model/tests.md
- contributing/model/multimodal.md
- Design Documents: - Design Documents:
- V0: design - V0: design
- V1: design/v1 - V1: design/v1

View File

@ -40,7 +40,7 @@ vLLM is flexible and easy to use with:
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-lora support
For more information, check out the following: For more information, check out the following:

View File

@ -91,7 +91,7 @@ source to unblock the update process.
### FlashInfer ### FlashInfer
Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271): Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
```bash ```
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
export FLASHINFER_ENABLE_SM90=1 export FLASHINFER_ENABLE_SM90=1
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1" uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
@ -105,14 +105,14 @@ team if you want to get the package published there.
### xFormers ### xFormers
Similar to FlashInfer, here is how to build and install xFormers from source: Similar to FlashInfer, here is how to build and install xFormers from source:
```bash ```
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX' export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
``` ```
### Mamba ### Mamba
```bash ```
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
``` ```

View File

@ -16,33 +16,35 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
Start the vLLM OpenAI Compatible API server. Start the vLLM OpenAI Compatible API server.
??? Examples Examples:
```bash ```bash
# Start with a model # Start with a model
vllm serve meta-llama/Llama-2-7b-hf vllm serve meta-llama/Llama-2-7b-hf
# Specify the port # Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100 vllm serve meta-llama/Llama-2-7b-hf --port 8100
# Check with --help for more options # Check with --help for more options
# To list all groups # To list all groups
vllm serve --help=listgroup vllm serve --help=listgroup
# To view a argument group # To view a argument group
vllm serve --help=ModelConfig vllm serve --help=ModelConfig
# To view a single argument # To view a single argument
vllm serve --help=max-num-seqs vllm serve --help=max-num-seqs
# To search by keyword # To search by keyword
vllm serve --help=max vllm serve --help=max
``` ```
## chat ## chat
Generate chat completions via the running API server. Generate chat completions via the running API server.
Examples:
```bash ```bash
# Directly connect to localhost API without arguments # Directly connect to localhost API without arguments
vllm chat vllm chat
@ -58,6 +60,8 @@ vllm chat --quick "hi"
Generate text completions based on the given prompt via the running API server. Generate text completions based on the given prompt via the running API server.
Examples:
```bash ```bash
# Directly connect to localhost API without arguments # Directly connect to localhost API without arguments
vllm complete vllm complete
@ -69,8 +73,6 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
vllm complete --quick "The future of AI is" vllm complete --quick "The future of AI is"
``` ```
</details>
## bench ## bench
Run benchmark tests for latency online serving throughput and offline inference throughput. Run benchmark tests for latency online serving throughput and offline inference throughput.
@ -87,6 +89,8 @@ vllm bench {latency, serve, throughput}
Benchmark the latency of a single batch of requests. Benchmark the latency of a single batch of requests.
Example:
```bash ```bash
vllm bench latency \ vllm bench latency \
--model meta-llama/Llama-3.2-1B-Instruct \ --model meta-llama/Llama-3.2-1B-Instruct \
@ -100,6 +104,8 @@ vllm bench latency \
Benchmark the online serving throughput. Benchmark the online serving throughput.
Example:
```bash ```bash
vllm bench serve \ vllm bench serve \
--model meta-llama/Llama-3.2-1B-Instruct \ --model meta-llama/Llama-3.2-1B-Instruct \
@ -114,6 +120,8 @@ vllm bench serve \
Benchmark offline inference throughput. Benchmark offline inference throughput.
Example:
```bash ```bash
vllm bench throughput \ vllm bench throughput \
--model meta-llama/Llama-3.2-1B-Instruct \ --model meta-llama/Llama-3.2-1B-Instruct \
@ -135,8 +143,7 @@ vllm collect-env
Run batch prompts and write results to file. Run batch prompts and write results to file.
<details> Examples:
<summary>Examples</summary>
```bash ```bash
# Running with a local file # Running with a local file
@ -152,8 +159,6 @@ vllm run-batch \
--model meta-llama/Meta-Llama-3-8B-Instruct --model meta-llama/Meta-Llama-3-8B-Instruct
``` ```
</details>
## More Help ## More Help
For detailed options of any subcommand, use: For detailed options of any subcommand, use:

View File

@ -1,6 +0,0 @@
---
title: Contact Us
---
[](){ #contactus }
--8<-- "README.md:contact-us"

View File

@ -57,21 +57,19 @@ By default, we optimize model inference using CUDA graphs which take up extra me
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
??? Code ```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
```python llm = LLM(
from vllm import LLM model="meta-llama/Llama-3.1-8B-Instruct",
from vllm.config import CompilationConfig, CompilationLevel compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
llm = LLM( # By default, it goes up to max_num_seqs
model="meta-llama/Llama-3.1-8B-Instruct", cudagraph_capture_sizes=[1, 2, 4, 8, 16],
compilation_config=CompilationConfig( ),
level=CompilationLevel.PIECEWISE, )
# By default, it goes up to max_num_seqs ```
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
),
)
```
You can disable graph capturing completely via the `enforce_eager` flag: You can disable graph capturing completely via the `enforce_eager` flag:
@ -129,20 +127,18 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.
Here are some examples: Here are some examples:
??? Code ```python
from vllm import LLM
```python # Available for Qwen2-VL series models
from vllm import LLM llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})
# Available for Qwen2-VL series models # Available for InternVL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={ mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28 "max_dynamic_patch": 4, # Default is 12
}) })
```
# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```

View File

@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system:
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
??? Code ```python
--8<-- "vllm/envs.py:env-vars-definition"
```python ```
--8<-- "vllm/envs.py:env-vars-definition"
```

View File

@ -29,8 +29,6 @@ See <gh-file:LICENSE>.
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
Check out the [building from source][build-from-source] documentation for details. Check out the [building from source][build-from-source] documentation for details.
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
### Building the docs with MkDocs ### Building the docs with MkDocs
#### Introduction to MkDocs #### Introduction to MkDocs
@ -95,27 +93,25 @@ For additional features and advanced configurations, refer to the official [MkDo
## Testing ## Testing
??? note "Commands" ```bash
pip install -r requirements/dev.txt
```bash # Linting, formatting and static type checking
pip install -r requirements/dev.txt pre-commit install --hook-type pre-commit --hook-type commit-msg
# Linting, formatting and static type checking # You can manually run pre-commit with
pre-commit install --hook-type pre-commit --hook-type commit-msg pre-commit run --all-files
# You can manually run pre-commit with # To manually run something from CI that does not run
pre-commit run --all-files # locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# To manually run something from CI that does not run # Unit tests
# locally by default, you can run: pytest tests/
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Unit tests # Run tests for a single test file with detailed output
pytest tests/ pytest -s -v tests/test_logger.py
```
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip !!! tip
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
@ -151,14 +147,6 @@ the terms of the DCO.
Using `-s` with `git commit` will automatically add this header. Using `-s` with `git commit` will automatically add this header.
!!! tip
You can enable automatic sign-off via your IDE:
- **PyCharm**: Click on the `Show Commit Options` icon to the right of the `Commit and Push...` button in the `Commit` window.
It will bring up a `git` window where you can modify the `Author` and enable `Sign-off commit`.
- **VSCode**: Open the [Settings editor](https://code.visualstudio.com/docs/configure/settings)
and enable the `Git: Always Sign Off` (`git.alwaysSignOff`) field.
### PR Title and Classification ### PR Title and Classification
Only specific types of PRs will be reviewed. The PR title is prefixed Only specific types of PRs will be reviewed. The PR title is prefixed
@ -198,7 +186,6 @@ The PR needs to meet the following code quality standards:
### Adding or Changing Kernels ### Adding or Changing Kernels
When actively developing or modifying kernels, using the [Incremental Compilation Workflow](./incremental_build.md) is highly recommended for faster build times.
Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
- Make sure custom ops are registered following PyTorch guidelines: - Make sure custom ops are registered following PyTorch guidelines:

View File

@ -1,138 +0,0 @@
# Incremental Compilation Workflow
When working on vLLM's C++/CUDA kernels located in the `csrc/` directory, recompiling the entire project with `uv pip install -e .` for every change can be time-consuming. An incremental compilation workflow using CMake allows for faster iteration by only recompiling the necessary components after an initial setup. This guide details how to set up and use such a workflow, which complements your editable Python installation.
## Prerequisites
Before setting up the incremental build:
1. **vLLM Editable Install:** Ensure you have vLLM installed from source in an editable mode. Using pre-compiled wheels for the initial editable setup can be faster, as the CMake workflow will handle subsequent kernel recompilations.
```console
uv venv --python 3.12 --seed
source .venv/bin/activate
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
```
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
```console
uv pip install -r requirements/build.txt --torch-backend=auto
```
## Setting up the CMake Build Environment
The incremental build process is managed through CMake. You can configure your build settings using a `CMakeUserPresets.json` file at the root of the vLLM repository.
### Generate `CMakeUserPresets.json` using the helper script
To simplify the setup, vLLM provides a helper script that attempts to auto-detect your system's configuration (like CUDA path, Python environment, and CPU cores) and generates the `CMakeUserPresets.json` file for you.
**Run the script:**
Navigate to the root of your vLLM clone and execute the following command:
```console
python tools/generate_cmake_presets.py
```
The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.
After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository.
### Example `CMakeUserPresets.json`
Below is an example of what the generated `CMakeUserPresets.json` might look like. The script will tailor these values based on your system and any input you provide.
```json
{
"version": 6,
"cmakeMinimumRequired": {
"major": 3,
"minor": 26,
"patch": 1
},
"configurePresets": [
{
"name": "release",
"generator": "Ninja",
"binaryDir": "${sourceDir}/cmake-build-release",
"cacheVariables": {
"CMAKE_CUDA_COMPILER": "/usr/local/cuda/bin/nvcc",
"CMAKE_C_COMPILER_LAUNCHER": "ccache",
"CMAKE_CXX_COMPILER_LAUNCHER": "ccache",
"CMAKE_CUDA_COMPILER_LAUNCHER": "ccache",
"CMAKE_BUILD_TYPE": "Release",
"VLLM_PYTHON_EXECUTABLE": "/home/user/venvs/vllm/bin/python",
"CMAKE_INSTALL_PREFIX": "${sourceDir}",
"CMAKE_CUDA_FLAGS": "",
"NVCC_THREADS": "4",
"CMAKE_JOB_POOLS": "compile=32"
}
}
],
"buildPresets": [
{
"name": "release",
"configurePreset": "release",
"jobs": 32
}
]
}
```
**What do the various configurations mean?**
- `CMAKE_CUDA_COMPILER`: Path to your `nvcc` binary. The script attempts to find this automatically.
- `CMAKE_C_COMPILER_LAUNCHER`, `CMAKE_CXX_COMPILER_LAUNCHER`, `CMAKE_CUDA_COMPILER_LAUNCHER`: Setting these to `ccache` (or `sccache`) significantly speeds up rebuilds by caching compilation results. Ensure `ccache` is installed (e.g., `sudo apt install ccache` or `conda install ccache`). The script sets these by default.
- `VLLM_PYTHON_EXECUTABLE`: Path to the Python executable in your vLLM development environment. The script will prompt for this, defaulting to the current Python environment if suitable.
- `CMAKE_INSTALL_PREFIX: "${sourceDir}"`: Specifies that the compiled components should be installed back into your vLLM source directory. This is crucial for the editable install, as it makes the newly built kernels immediately available to your Python environment.
- `CMAKE_JOB_POOLS` and `jobs` in build presets: Control the parallelism of the build. The script sets these based on the number of CPU cores detected on your system.
- `binaryDir`: Specifies where the build artifacts will be stored (e.g., `cmake-build-release`).
## Building and Installing with CMake
Once your `CMakeUserPresets.json` is configured:
1. **Initialize the CMake build environment:**
This step configures the build system according to your chosen preset (e.g., `release`) and creates the build directory specified by `binaryDir`.
```console
cmake --preset release
```
2. **Build and install the vLLM components:**
This command compiles the code and installs the resulting binaries into your vLLM source directory, making them available to your editable Python installation.
```console
cmake --build --preset release --target install
```
3. **Make changes and repeat!**
Now you can start using your editable install of vLLM, testing and making changes as needed. If you need to rebuild after further changes, simply run the CMake command again; only the affected files will be recompiled.
```console
cmake --build --preset release --target install
```
## Verifying the Build
After a successful build, you will find a populated build directory (e.g., `cmake-build-release/` if you used the `release` preset and the example configuration).
```console
> ls cmake-build-release/
bin cmake_install.cmake _deps machete_generation.log
build.ninja CPackConfig.cmake detect_cuda_compute_capabilities.cu marlin_generation.log
_C.abi3.so CPackSourceConfig.cmake detect_cuda_version.cc _moe_C.abi3.so
CMakeCache.txt ctest _flashmla_C.abi3.so moe_marlin_generation.log
CMakeFiles cumem_allocator.abi3.so install_local_manifest.txt vllm-flash-attn
```
The `cmake --build ... --target install` command copies the compiled shared libraries (like `_C.abi3.so`, `_moe_C.abi3.so`, etc.) into the appropriate `vllm` package directory within your source tree. This updates your editable installation with the newly compiled kernels.
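As a quick sanity check that the refreshed extensions are importable (module names taken from the listing above):

```console
python -c "import vllm._C, vllm._moe_C; print('custom ops loaded')"
```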
## Additional Tips
- **Adjust Parallelism:** Fine-tune the `CMAKE_JOB_POOLS` in `configurePresets` and `jobs` in `buildPresets` in your `CMakeUserPresets.json`. Too many jobs can overload systems with limited RAM or CPU cores, leading to slower builds or system instability. Too few won't fully utilize available resources.
- **Clean Builds When Necessary:** If you encounter persistent or strange build errors, especially after significant changes or switching branches, consider removing the CMake build directory (e.g., `rm -rf cmake-build-release`) and re-running the `cmake --preset` and `cmake --build` commands.
- **Specific Target Builds:** For even faster iterations when working on a specific module, you can sometimes build a specific target instead of the full `install` target, though `install` ensures all necessary components are updated in your Python environment. Refer to CMake documentation for more advanced target management. See the sketch below for an example.
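For example, assuming the Ninja target for the core extension matches its module name (`_C` here is inferred from the build-directory listing above, not a documented target):

```console
cmake --build --preset release --target _C
```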
@@ -1,23 +1,21 @@
---
title: Summary
---

[](){ #new-model }

!!! important
    Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!

vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features][compatibility-matrix] to optimize their performance.

The complexity of integrating a model into vLLM depends heavily on the model's architecture.
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
However, this can be more complex for models that include new operators (e.g., a new attention mechanism).

Read through these pages for a step-by-step guide:

- [Basic Model](basic.md)
- [Registering a Model](registration.md)
- [Unit Testing](tests.md)
- [Multi-Modal Support](multimodal.md)

!!! tip
    If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
@@ -1,5 +1,5 @@
---
title: Basic Model
---

[](){ #new-model-basic }

@@ -27,35 +27,33 @@ All vLLM modules within the model must include a `prefix` argument in their constructors

The initialization code should look like this:

??? Code

    ```python
    from torch import nn
    from vllm.config import VllmConfig
    from vllm.attention import Attention

    class MyAttention(nn.Module):
        def __init__(self, vllm_config: VllmConfig, prefix: str):
            super().__init__()
            self.attn = Attention(prefix=f"{prefix}.attn")

    class MyDecoderLayer(nn.Module):
        def __init__(self, vllm_config: VllmConfig, prefix: str):
            super().__init__()
            self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")

    class MyModel(nn.Module):
        def __init__(self, vllm_config: VllmConfig, prefix: str):
            super().__init__()
            self.layers = nn.ModuleList(
                [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
            )

    class MyModelForCausalLM(nn.Module):
        def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
            super().__init__()
            self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
    ```

### Computation Code
@@ -25,63 +25,59 @@ Further update the model as follows:
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.

    ??? Code

        ```python
        class YourModelForImage2Seq(nn.Module):
            ...

            def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:

                assert self.vision_encoder is not None
                image_features = self.vision_encoder(image_input)
                return self.multi_modal_projector(image_features)

            def get_multimodal_embeddings(
                    self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

                # Validate the multimodal input keyword arguments
                image_input = self._parse_and_validate_image_input(**kwargs)
                if image_input is None:
                    return None

                # Run multimodal inputs through encoder and projector
                vision_embeddings = self._process_image_input(image_input)
                return vision_embeddings
        ```

    !!! important
        The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.

- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.

    ??? Code

        ```python
        from .utils import merge_multimodal_embeddings

        class YourModelForImage2Seq(nn.Module):
            ...

            def get_input_embeddings(
                self,
                input_ids: torch.Tensor,
                multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
            ) -> torch.Tensor:

                # `get_input_embeddings` should already be implemented for the language
                # model as one of the requirements of basic vLLM model implementation.
                inputs_embeds = self.language_model.get_input_embeddings(input_ids)

                if multimodal_embeddings is not None:
                    inputs_embeds = merge_multimodal_embeddings(
                        input_ids=input_ids,
                        inputs_embeds=inputs_embeds,
                        multimodal_embeddings=multimodal_embeddings,
                        placeholder_token_id=self.config.image_token_index)

                return inputs_embeds
        ```

- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
@@ -139,46 +135,42 @@ Assuming that the memory usage increases with the number of tokens, the dummy in…

Looking at the code of HF's `LlavaForConditionalGeneration`:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
    n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
    n_image_features = image_features.shape[0] * image_features.shape[1]

    if n_image_tokens != n_image_features:
        raise ValueError(
            f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
        )
    special_image_mask = (
        (input_ids == self.config.image_token_index)
        .unsqueeze(-1)
        .expand_as(inputs_embeds)
        .to(inputs_embeds.device)
    )
    image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
    ```

The number of placeholder feature tokens per image is `image_features.shape[1]`.
`image_features` is calculated inside the `get_image_features` method:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
    image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

    selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
    if vision_feature_select_strategy == "default":
        selected_image_feature = selected_image_feature[:, 1:]
    elif vision_feature_select_strategy == "full":
        selected_image_feature = selected_image_feature
    else:
        raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
    image_features = self.multi_modal_projector(selected_image_feature)
    return image_features
    ```

We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
@@ -201,22 +193,20 @@ Assuming that the memory usage increases with the number of tokens, the dummy in…

To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
    target_dtype = self.patch_embedding.weight.dtype
    patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
    patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

    class_embeds = self.class_embedding.expand(batch_size, 1, -1)
    embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
    if interpolate_pos_encoding:
        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
    else:
        embeddings = embeddings + self.position_embedding(self.position_ids)
    return embeddings
    ```

We can infer that `embeddings.shape[1] == self.num_positions`, where

@@ -228,59 +218,55 @@ Assuming that the memory usage increases with the number of tokens, the dummy in…

Overall, the number of placeholder feature tokens for an image can be calculated as:

??? Code

    ```python
    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        hf_config = self.get_hf_config()
        hf_processor = self.get_hf_processor()

        image_size = hf_config.vision_config.image_size
        patch_size = hf_config.vision_config.patch_size

        num_image_tokens = (image_size // patch_size) ** 2 + 1
        if hf_processor.vision_feature_select_strategy == "default":
            num_image_tokens -= 1

        return num_image_tokens
    ```
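For example, for [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) (image size 336, patch size 14), this evaluates to `(336 // 14) ** 2 + 1 = 577`, and the `"default"` strategy drops the class token, leaving 576 placeholder feature tokens per image.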
Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size` to calculate the multimodal profiling data:

??? Code

    ```python
    # NOTE: In actuality, this is usually implemented as part of the
    # model's subclass of `BaseProcessingInfo`, but we show it as is
    # here for simplicity.
    def get_image_size_with_most_features(self) -> ImageSize:
        hf_config = self.get_hf_config()
        width = height = hf_config.image_size
        return ImageSize(width=width, height=height)

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = \
            self.info.get_image_size_with_most_features()

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }
    ```

For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
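A minimal sketch of that expansion (the `"<image>"` string and the method shape are illustrative, following the builder pattern used above):

```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)
    return "<image>" * num_images
```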
@@ -298,23 +284,21 @@ Assuming that the memory usage increases with the number of tokens, the dummy in…

Looking at the code of HF's `FuyuForCausalLM`:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
    if image_patches is not None and past_key_values is None:
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
            .squeeze(0)
            .to(inputs_embeds.device)
            for patch in image_patches
        ]
        inputs_embeds = self.gather_continuous_embeddings(
            word_embeddings=inputs_embeds,
            continuous_embeddings=patch_embeddings,
            image_patch_input_indices=image_patches_indices,
        )
    ```

The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
@@ -328,98 +312,92 @@ Assuming that the memory usage increases with the number of tokens, the dummy in…

In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
returning the dimensions after resizing (but before padding) as metadata.

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
    image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
    batch_images = image_encoding["images"]
    image_unpadded_heights = image_encoding["image_unpadded_heights"]
    image_unpadded_widths = image_encoding["image_unpadded_widths"]

    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
    if do_resize:
        batch_images = [
            [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
            for images in batch_images
        ]

    image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
    image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
    image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]

    if do_pad:
        batch_images = [
            [
                self.pad_image(
                    image,
                    size=size,
                    mode=padding_mode,
                    constant_values=padding_value,
                    input_data_format=input_data_format,
                )
                for image in images
            ]
            for images in batch_images
        ]
    ```
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
        image_input=tensor_batch_images,
        image_present=image_present,
        image_unpadded_h=image_unpadded_heights,
        image_unpadded_w=image_unpadded_widths,
        image_placeholder_id=image_placeholder_id,
        image_newline_id=image_newline_id,
        variable_sized=True,
    )

    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
    image_height, image_width = image.shape[1], image.shape[2]
    if variable_sized:  # variable_sized=True
        new_h = min(
            image_height,
            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
        )
        new_w = min(
            image_width,
            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
        )
        image = image[:, :new_h, :new_w]
        image_height, image_width = new_h, new_w

    num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
    tensor_of_image_ids = torch.full(
        [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
    )
    patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
    assert num_patches == patches.shape[0]
    ```
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
    patch_size = patch_size if patch_size is not None else self.patch_size
    patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]

    if image_height % patch_height != 0:
        raise ValueError(f"{image_height=} must be divisible by {patch_height}")
    if image_width % patch_width != 0:
        raise ValueError(f"{image_width=} must be divisible by {patch_width}")

    num_patches_per_dim_h = image_height // patch_height
    num_patches_per_dim_w = image_width // patch_width
    num_patches = num_patches_per_dim_h * num_patches_per_dim_w
    ```
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
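Following that reasoning, a sketch of the corresponding helper (mirroring the LLaVA example earlier; the exact implementation may differ):

```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()
    return ImageSize(width=image_processor.size["width"],
                     height=image_processor.size["height"])
```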
@@ -441,25 +419,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in…

For the multimodal image profiling data, the logic is very similar to LLaVA:

??? Code

    ```python
    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
        num_images = mm_counts.get("image", 0)

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }
    ```
## 4. Specify processing details

@@ -479,7 +455,6 @@ return a schema of the tensors outputted by the HF processor that are related to…

The output of `CLIPImageProcessor` is a simple tensor with shape
`(num_images, num_channels, image_height, image_width)`:

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
images = [
```
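The elided hunk then defines the field config itself; for a simple batched tensor like this, a minimal sketch (following the pattern of the surrounding methods, not necessarily the exact original) looks like:

```python
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    return dict(
        pixel_values=MultiModalFieldConfig.batched("image"),
    )
```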
@@ -530,37 +505,35 @@ return a schema of the tensors outputted by the HF processor that are related to…
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:

??? Code

    ```python
    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        processed_outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
        )

        image_patches = processed_outputs.get("image_patches")
        if image_patches is not None:
            images = mm_data["images"]
            assert isinstance(images, list)

            # Original output: (1, num_images, Pn, Px * Py * C)
            # New output: (num_images, Pn, Px * Py * C)
            assert (isinstance(image_patches, list)
                    and len(image_patches) == 1)
            assert (isinstance(image_patches[0], torch.Tensor)
                    and len(image_patches[0]) == len(images))

            processed_outputs["image_patches"] = image_patches[0]

        return processed_outputs
    ```
!!! note
    Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling

@@ -600,37 +573,35 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies…
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:

??? Code

    ```python
    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        image_token_id = hf_config.image_token_index

        def get_replacement(item_idx: int):
            images = mm_items.get_items("image", ImageProcessorItems)

            image_size = images.get_image_size(item_idx)
            num_image_tokens = self.info.get_num_image_tokens(
                image_width=image_size.width,
                image_height=image_size.height,
            )

            return [image_token_id] * num_image_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=get_replacement,
            ),
        ]
    ```
=== "Handling additional tokens: Fuyu" === "Handling additional tokens: Fuyu"
@ -645,90 +616,117 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
We define a helper function to return `ncols` and `nrows` directly: We define a helper function to return `ncols` and `nrows` directly:
??? Code ```python
def get_image_feature_grid_size(
self,
*,
image_width: int,
image_height: int,
) -> tuple[int, int]:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
```python if not (image_width <= target_width and image_height <= target_height):
def get_image_feature_grid_size( height_scale_factor = target_height / image_height
self, width_scale_factor = target_width / image_width
*, optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_width: int,
image_height: int,
) -> tuple[int, int]:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height): image_height = int(image_height * optimal_scale_factor)
height_scale_factor = target_height / image_height image_width = int(image_width * optimal_scale_factor)
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_height = int(image_height * optimal_scale_factor) ncols = math.ceil(image_width / patch_width)
image_width = int(image_width * optimal_scale_factor) nrows = math.ceil(image_height / patch_height)
return ncols, nrows
ncols = math.ceil(image_width / patch_width) ```
nrows = math.ceil(image_height / patch_height)
return ncols, nrows
```
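For instance, assuming Fuyu's default target size of 1920×1080 with 30×30 patches, an image already at the target size yields `ncols = 64` and `nrows = 36`.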
Based on this, we can initially define our replacement tokens as:

??? Code

    ```python
    def get_replacement(item_idx: int):
        images = mm_items.get_items("image", ImageProcessorItems)
        image_size = images.get_image_size(item_idx)

        ncols, nrows = self.info.get_image_feature_grid_size(
            image_width=image_size.width,
            image_height=image_size.height,
        )

        # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
        # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
        return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
    ```
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the prompt:

??? Code

    ```python
    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
        image_input=tensor_batch_images,
        image_present=image_present,
        image_unpadded_h=image_unpadded_heights,
        image_unpadded_w=image_unpadded_widths,
        image_placeholder_id=image_placeholder_id,
        image_newline_id=image_newline_id,
        variable_sized=True,
    )
    prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
        tokenizer=self.tokenizer,
        prompts=prompts,
        scale_factors=scale_factors,
        max_tokens_to_generate=self.max_tokens_to_generate,
        max_position_embeddings=self.max_position_embeddings,
        add_BOS=True,
        add_beginning_of_answer_token=True,
    )
    ```
To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:

??? Code

    ```python
    hf_config = self.info.get_hf_config()
    bos_token_id = hf_config.bos_token_id  # `<s>`
    assert isinstance(bos_token_id, int)

    def get_replacement_fuyu(item_idx: int):
        images = mm_items.get_items("image", ImageProcessorItems)
        image_size = images.get_image_size(item_idx)

        ncols, nrows = self.info.get_image_feature_grid_size(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                        [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
            embed_token_id=_IMAGE_TOKEN_ID,
        )
    ```

Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
we can search for it to conduct the replacement at the start of the string:

??? Code

    ```python
    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        bos_token_id = hf_config.bos_token_id
        assert isinstance(bos_token_id, int)

        tokenizer = self.info.get_tokenizer()
        eot_token_id = tokenizer.bos_token_id
        assert isinstance(eot_token_id, int)

        def get_replacement_fuyu(item_idx: int):
            images = mm_items.get_items("image", ImageProcessorItems)
            image_size = images.get_image_size(item_idx)

            ncols, nrows = self.info.get_image_feature_grid_size(
                image_width=image_size.width,
                image_height=image_size.height,
            )
            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                            [_NEWLINE_TOKEN_ID]) * nrows

            return PromptUpdateDetails.select_token_id(
                image_tokens + [bos_token_id],
                embed_token_id=_IMAGE_TOKEN_ID,
            )

        return [
            PromptReplacement(
                modality="image",
                target=[eot_token_id],
                replacement=get_replacement_fuyu,
            )
        ]
    ```
## 5. Register processor-related classes
@@ -1,5 +1,5 @@
---
title: Registering a Model
---

[](){ #new-model-registration }
@@ -1,5 +1,5 @@
---
title: Unit Testing
---

[](){ #new-model-tests }
@@ -30,21 +30,13 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example…
#### OpenAI Server

```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
    python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3-70B
```

benchmark_serving.py:

```bash
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3-70B \
    --dataset-name sharegpt \
    --dataset-path sharegpt.json \
    --profile \
    --num-prompts 2
```

## Profile with NVIDIA Nsight Systems

@@ -72,16 +64,7 @@ For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fo…

The following is an example using the `benchmarks/benchmark_latency.py` script:

```bash
nsys profile -o report.nsys-rep \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
    python benchmarks/benchmark_latency.py \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --num-iters-warmup 5 \
    --num-iters 1 \
    --batch-size 16 \
    --input-len 512 \
    --output-len 8
```

#### OpenAI Server

@@ -90,21 +73,10 @@ To profile the server, you will want to prepend your `vllm serve` command with `…

```bash
# server
nsys profile -o report.nsys-rep \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
    --delay 30 \
    --duration 60 \
    vllm serve meta-llama/Llama-3.1-8B-Instruct

# client
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --num-prompts 1 \
    --dataset-name random \
    --random-input 1024 \
    --random-output 512
```
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
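```bash
nsys sessions list
```

to get the session id in the form of `profile-XXXXX`, then run:

```bash
nsys stop --session=profile-XXXXX
```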
@@ -125,26 +97,26 @@ to manually kill the profiler and generate your `nsys-rep` report.
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).

??? CLI example

    ```bash
    nsys stats report1.nsys-rep
    ...
    ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

    Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
    -------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
    46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
    14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
    12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
    9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
    5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
    4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
    2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
    1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
    0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
    ...
    ```

GUI example:
@@ -10,7 +10,7 @@ title: Using Docker
vLLM offers an official Docker image for deployment. vLLM offers an official Docker image for deployment.
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
```bash ```console
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
@ -22,7 +22,7 @@ docker run --runtime nvidia --gpus all \
This image can also be used with other container engines such as [Podman](https://podman.io/). This image can also be used with other container engines such as [Podman](https://podman.io/).
```bash ```console
podman run --gpus all \ podman run --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
@ -71,7 +71,7 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM: You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
```bash ```console
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . \ DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \ --target vllm-openai \
@ -97,28 +97,26 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
??? Command

    ```bash
    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
    python3 use_existing_torch.py
    DOCKER_BUILDKIT=1 docker build . \
        --file docker/Dockerfile \
        --target vllm-openai \
        --platform "linux/arm64" \
        -t vllm/vllm-gh200-openai:latest \
        --build-arg max_jobs=66 \
        --build-arg nvcc_threads=2 \
        --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
        --build-arg vllm_fa_cmake_gpu_arches="90-real"
    ```
!!! note
    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.

    Run the following command on your host machine to register QEMU user static handlers:

    ```bash
    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
    ```
@ -128,7 +126,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
To run vLLM with the custom-built Docker image:

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
View File
@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
    ```
View File
@ -11,7 +11,7 @@ title: AutoGen
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment

    ```bash
    pip install vllm

    # Install AgentChat and OpenAI client from Extensions
@ -23,60 +23,58 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    python -m vllm.entrypoints.openai.api_server \
        --model mistralai/Mistral-7B-Instruct-v0.2
    ```
- Call it with AutoGen:

??? Code

    ```python
    import asyncio
    from autogen_core.models import UserMessage
    from autogen_ext.models.openai import OpenAIChatCompletionClient
    from autogen_core.models import ModelFamily


    async def main() -> None:
        # Create a model client
        model_client = OpenAIChatCompletionClient(
            model="mistralai/Mistral-7B-Instruct-v0.2",
            base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
            api_key="EMPTY",
            model_info={
                "vision": False,
                "function_calling": False,
                "json_output": False,
                "family": ModelFamily.MISTRAL,
                "structured_output": True,
            },
        )

        messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]

        # Create a stream.
        stream = model_client.create_stream(messages=messages)

        # Iterate over the stream and print the responses.
        print("Streamed responses:")
        async for response in stream:
            if isinstance(response, str):
                # A partial response is a string.
                print(response, flush=True, end="")
            else:
                # The last response is a CreateResult object with the complete message.
                print("\n\n------------\n")
                print("The complete response:", flush=True)
                print(response.content, flush=True)

        # Close the client when done.
        await model_client.close()


    asyncio.run(main())
    ```
For details, see the tutorial:
View File
@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
To install the Cerebrium client, run:

```bash
pip install cerebrium
cerebrium login
```
Next, to create your Cerebrium project, run:
```bash
cerebrium init vllm-project
```
@ -34,81 +34,75 @@ vllm = "latest"
Next, to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` in this example), add the following code to your `main.py`:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")

    def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):

        sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
        outputs = llm.generate(prompts, sampling_params)

        # Collect the outputs.
        results = []
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            results.append({"prompt": prompt, "generated_text": generated_text})

        return {"results": results}
    ```
Then, run the following command to deploy it to the cloud:
```bash
cerebrium deploy
```
If successful, you should be returned a curl command that you can call inference against. Just remember to end the URL with the function name you are calling (in our case `/run`).
??? Command

    ```bash
    curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
        -H 'Content-Type: application/json' \
        -H 'Authorization: <JWT TOKEN>' \
        --data '{
            "prompts": [
                "Hello, my name is",
                "The president of the United States is",
                "The capital of France is",
                "The future of AI is"
            ]
        }'
    ```
You should get a response like:
??? Response

    ```json
    {
      "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
      "result": {
        "result": [
          {
            "prompt": "Hello, my name is",
            "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
          },
          {
            "prompt": "The president of the United States is",
            "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
          },
          {
            "prompt": "The capital of France is",
            "generated_text": " Paris.\n"
          },
          {
            "prompt": "The future of AI is",
            "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
          }
        ],
        "run_time_ms": 152.53663063049316
      }
    }
    ```
You now have an autoscaling endpoint where you only pay for the compute you use!
View File
@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
    ```
View File
@ -18,13 +18,13 @@ This guide walks you through deploying Dify using a vLLM backend.
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve Qwen/Qwen1.5-7B-Chat
    ```
- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):

    ```bash
    git clone https://github.com/langgenius/dify.git
    cd dify
    cd docker
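    # Remaining quick-start steps, assumed from the Dify docs linked above:
    cp .env.example .env
    docker compose up -d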
View File
@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
To install the dstack client, run:
```bash
pip install "dstack[all]"
dstack server
```
Next, to configure your dstack project, run:
```bash
mkdir -p vllm-dstack
cd vllm-dstack
dstack init
@ -26,81 +26,75 @@ dstack init
Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
??? Config

    ```yaml
    type: service

    python: "3.11"
    env:
      - MODEL=NousResearch/Llama-2-7b-chat-hf
    port: 8000
    resources:
      gpu: 24GB
    commands:
      - pip install vllm
      - vllm serve $MODEL --port 8000
    model:
      format: openai
      type: chat
      name: NousResearch/Llama-2-7b-chat-hf
    ```
Then, run the following CLI command for provisioning:
??? Command

    ```console
    $ dstack run . -f serve.dstack.yml

    ⠸ Getting run plan...
     Configuration  serve.dstack.yml
     Project        deep-diver-main
     User           deep-diver
     Min resources  2..xCPU, 8GB.., 1xGPU (24GB)
     Max price      -
     Max duration   -
     Spot policy    auto
     Retry policy   no

     #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE
     1  gcp      us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
     2  gcp      us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
     3  gcp      us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
        ...
     Shown 3 of 193 offers, $5.876 max

    Continue? [y/n]: y
    ⠙ Submitting run...
    ⠏ Launching spicy-treefrog-1 (pulling)
    spicy-treefrog-1 provisioning completed (running)
    Service is published at ...
    ```
After the provisioning, you can interact with the model by using the OpenAI SDK:
??? Code

    ```python
    from openai import OpenAI

    client = OpenAI(
        base_url="https://gateway.<gateway domain>",
        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
    )

    completion = client.chat.completions.create(
        model="NousResearch/Llama-2-7b-chat-hf",
        messages=[
            {
                "role": "user",
                "content": "Compose a poem that explains the concept of recursion in programming.",
            }
        ]
    )

    print(completion.choices[0].message.content)
    ```
!!! note
    dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on serving vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
View File
@ -13,7 +13,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Setup vLLM and Haystack environment

    ```bash
    pip install vllm haystack-ai
    ```
@ -21,35 +21,35 @@ pip install vllm haystack-ai
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve mistralai/Mistral-7B-Instruct-v0.1
    ```
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.

??? Code

    ```python
    from haystack.components.generators.chat import OpenAIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    generator = OpenAIChatGenerator(
        # for compatibility with the OpenAI API, a placeholder api_key is needed
        api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
        model="mistralai/Mistral-7B-Instruct-v0.1",
        api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
        generation_kwargs={"max_tokens": 512}
    )

    response = generator.run(
        messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
    )

    print("-"*30)
    print(response)
    print("-"*30)
    ```

Output e.g.:

```console
------------------------------
View File
@ -5,9 +5,9 @@ title: Helm
A Helm chart to deploy vLLM for Kubernetes

Helm is a package manager for Kubernetes. It helps automate the deployment of vLLM applications on Kubernetes. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.

This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, the steps for Helm installation, and documentation on the architecture and values file.

## Prerequisites
@ -16,27 +16,21 @@ Before you begin, ensure that you have the following:
- A running Kubernetes cluster
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
- Available GPU resources in your cluster
- An S3 bucket with the model which will be deployed
## Installing the chart

To install the chart with the release name `test-vllm`:
```bash
helm upgrade --install --create-namespace \
    --namespace=ns-vllm test-vllm . \
    -f values.yaml \
    --set secrets.s3endpoint=$ACCESS_POINT \
    --set secrets.s3bucketname=$BUCKET \
    --set secrets.s3accesskeyid=$ACCESS_KEY \
    --set secrets.s3accesskey=$SECRET_KEY
```
## Uninstalling the chart

To uninstall the `test-vllm` deployment:

```bash
helm uninstall test-vllm --namespace=ns-vllm
```
@ -45,59 +39,57 @@ chart **including persistent volumes** and deletes the release.
## Architecture

![helm deployment architecture](../../assets/deployment/architecture_helm_deployment.png)

## Values
The following table describes configurable parameters of the chart in `values.yaml`:

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
| autoscaling.enabled | bool | false | Enable autoscaling |
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
| autoscaling.minReplicas | int | 1 | Minimum replicas |
| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
| configs | object | {} | Configmap |
| containerPort | int | 8000 | Container port |
| customObjects | list | [] | Custom Objects configuration |
| deploymentStrategy | object | {} | Deployment strategy configuration |
| externalConfigs | list | [] | External configuration |
| extraContainers | list | [] | Additional containers configuration |
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
| extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
| extraPorts | list | [] | Additional ports configuration |
| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
| image.repository | string | "vllm/vllm-openai" | Image repository |
| image.tag | string | "latest" | Image tag |
| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the kubelet http request on the server |
| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the kubelet http request on the server |
| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
| replicaCount | int | 1 | Number of replicas |
| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
| resources.limits."nvidia.com/gpu" | int | 1 | Number of GPUs used |
| resources.limits.cpu | int | 4 | Number of CPUs |
| resources.limits.memory | string | "16Gi" | CPU memory configuration |
| resources.requests."nvidia.com/gpu" | int | 1 | Number of GPUs used |
| resources.requests.cpu | int | 4 | Number of CPUs |
| resources.requests.memory | string | "16Gi" | CPU memory configuration |
| secrets | object | {} | Secrets configuration |
| serviceName | string | "" | Service name |
| servicePort | int | 80 | Service port |
| labels.environment | string | test | Environment name |
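For example, to change just a few of these values you can pass an extra override file instead of editing the chart. A minimal sketch (`custom-values.yaml` is a hypothetical file name; the keys come from the table above):

```yaml
# custom-values.yaml
replicaCount: 2
image:
  repository: "vllm/vllm-openai"
  tag: "v0.9.1"  # assumption: pin a published tag instead of "latest"
```

Append `-f custom-values.yaml` to the `helm upgrade --install` command shown earlier to apply it.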
View File
@ -18,7 +18,7 @@ And LiteLLM supports all models on VLLM.
- Setup vLLM and litellm environment

    ```bash
    pip install vllm litellm
    ```
@ -28,35 +28,33 @@ pip install vllm litellm
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve qwen/Qwen1.5-0.5B-Chat
    ```
- Call it with litellm:

??? Code

    ```python
    import litellm

    messages = [{"content": "Hello, how are you?", "role": "user"}]

    # the "hosted_vllm" prefix is required for vLLM-hosted models
    response = litellm.completion(
        model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat",  # pass the vllm model name
        messages=messages,
        api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
        temperature=0.2,
        max_tokens=80)

    print(response)
    ```
### Embeddings

- Start the vLLM server with the supported embedding model, e.g.

    ```bash
    vllm serve BAAI/bge-base-en-v1.5
    ```
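- Call it with litellm (a sketch mirroring the chat example above; the host and port are placeholders, and using the `hosted_vllm/` prefix for embeddings is an assumption worth verifying against the LiteLLM docs):

??? Code

    ```python
    import litellm

    # Query the vLLM embedding server through LiteLLM's OpenAI-compatible route.
    response = litellm.embedding(
        model="hosted_vllm/BAAI/bge-base-en-v1.5",  # pass the vllm model name
        input=["Hello, how are you?"],
        api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
    )

    print(response)
    ```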
View File
@ -17,101 +17,99 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber
Deploy the following YAML file `lws.yaml`:
??? Yaml

    ```yaml
    apiVersion: leaderworkerset.x-k8s.io/v1
    kind: LeaderWorkerSet
    metadata:
      name: vllm
    spec:
      replicas: 2
      leaderWorkerTemplate:
        size: 2
        restartPolicy: RecreateGroupOnPodRestart
        leaderTemplate:
          metadata:
            labels:
              role: leader
          spec:
            containers:
              - name: vllm-leader
                image: docker.io/vllm/vllm-openai:latest
                env:
                  - name: HUGGING_FACE_HUB_TOKEN
                    value: <your-hf-token>
                command:
                  - sh
                  - -c
                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                    python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
                resources:
                  limits:
                    nvidia.com/gpu: "8"
                    memory: 1124Gi
                    ephemeral-storage: 800Gi
                  requests:
                    ephemeral-storage: 800Gi
                    cpu: 125
                ports:
                  - containerPort: 8080
                readinessProbe:
                  tcpSocket:
                    port: 8080
                  initialDelaySeconds: 15
                  periodSeconds: 10
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
                  sizeLimit: 15Gi
        workerTemplate:
          spec:
            containers:
              - name: vllm-worker
                image: docker.io/vllm/vllm-openai:latest
                command:
                  - sh
                  - -c
                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
                resources:
                  limits:
                    nvidia.com/gpu: "8"
                    memory: 1124Gi
                    ephemeral-storage: 800Gi
                  requests:
                    ephemeral-storage: 800Gi
                    cpu: 125
                env:
                  - name: HUGGING_FACE_HUB_TOKEN
                    value: <your-hf-token>
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
                  sizeLimit: 15Gi
    ---
    apiVersion: v1
    kind: Service
    metadata:
      name: vllm-leader
    spec:
      ports:
        - name: http
          port: 8080
          protocol: TCP
          targetPort: 8080
      selector:
        leaderworkerset.sigs.k8s.io/name: vllm
        role: leader
      type: ClusterIP
    ```
```bash
kubectl apply -f lws.yaml
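# (sketch) confirm that the leader and worker pods are Running before sending requests
kubectl get pods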
@ -177,27 +175,25 @@ curl http://localhost:8080/v1/completions \
The output should be similar to the following:
??? Output

    ```text
    {
      "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
      "object": "text_completion",
      "created": 1715138766,
      "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
      "choices": [
        {
          "index": 0,
          "text": " top destination for foodies, with",
          "logprobs": null,
          "finish_reason": "length",
          "stop_reason": null
        }
      ],
      "usage": {
        "prompt_tokens": 5,
        "total_tokens": 12,
        "completion_tokens": 7
      }
    }
    ```
View File
@ -7,13 +7,13 @@ title: Open WebUI
2. Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve qwen/Qwen1.5-0.5B-Chat
    ```
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vLLM serve host and port):

    ```bash
    docker run -d -p 3000:8080 \
        --name open-webui \
        -v open-webui:/app/backend/data \
View File
@ -15,7 +15,7 @@ Here are the integrations:
- Setup vLLM and langchain environment

    ```bash
    pip install -U vllm \
        langchain_milvus langchain_openai \
        langchain_community beautifulsoup4 \
@ -26,14 +26,14 @@ pip install -U vllm \
- Start the vLLM server with the supported embedding model, e.g.

    ```bash
    # Start embedding service (port 8000)
    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
    ```
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    # Start chat service (port 8001)
    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
    ```
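- Query both servers through LangChain's OpenAI-compatible clients. A minimal sketch (it assumes both servers run on localhost and uses the placeholder API key `"EMPTY"`):

??? Code

    ```python
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings

    # Chat model served by vLLM on port 8001.
    llm = ChatOpenAI(
        model="qwen/Qwen1.5-0.5B-Chat",
        base_url="http://localhost:8001/v1",
        api_key="EMPTY",
    )

    # Embedding model served by vLLM on port 8000. Skip OpenAI's token-count
    # preflight, which assumes OpenAI tokenizers.
    embeddings = OpenAIEmbeddings(
        model="ssmits/Qwen2-7B-Instruct-embed-base",
        base_url="http://localhost:8000/v1",
        api_key="EMPTY",
        check_embedding_ctx_length=False,
    )

    print(llm.invoke("Hello!").content)
    print(len(embeddings.embed_query("Hello!")))
    ```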
@ -52,7 +52,7 @@ python retrieval_augmented_generation_with_langchain.py
- Setup vLLM and llamaindex environment

    ```bash
    pip install vllm \
        llama-index llama-index-readers-web \
        llama-index-llms-openai-like \
@ -64,14 +64,14 @@ pip install vllm \
- Start the vLLM server with the supported embedding model, e.g.

    ```bash
    # Start embedding service (port 8000)
    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
    ```
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    # Start chat service (port 8001)
    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
    ```
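- Query the chat server through LlamaIndex's `OpenAILike` client. A minimal sketch under the same localhost assumptions as above:

??? Code

    ```python
    from llama_index.llms.openai_like import OpenAILike

    # Point the OpenAI-compatible client at the vLLM chat server on port 8001.
    llm = OpenAILike(
        model="qwen/Qwen1.5-0.5B-Chat",
        api_base="http://localhost:8001/v1",
        api_key="EMPTY",
        is_chat_model=True,
    )

    print(llm.complete("Hello!"))
    ```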
View File
@ -15,7 +15,7 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled.

    ```bash
    pip install skypilot-nightly
    sky check
    ```
@ -24,54 +24,52 @@ sky check
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
??? Yaml

    ```yaml
    resources:
      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
      use_spot: True
      disk_size: 512  # Ensure model checkpoints can fit.
      disk_tier: best
      ports: 8081  # Expose to internet traffic.

    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      pip install vllm==0.4.0.post1
      # Install Gradio for web UI.
      pip install gradio openai
      pip install flash-attn==2.5.7

    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log &

      echo 'Waiting for vllm api server to start...'
      while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done

      echo 'Starting gradio server...'
      git clone https://github.com/vllm-project/vllm.git || true
      python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
        -m $MODEL_NAME \
        --port 8811 \
        --model-url http://localhost:8081/v1 \
        --stop-token-ids 128009,128001
    ```
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
```bash
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
```
@ -83,7 +81,7 @@ Check the output of the command. There will be a shareable gradio link (like the
**Optional**: Serve the 70B model instead of the default 8B and use more GPUs:
```bash
HF_TOKEN="your-huggingface-token" \
    sky launch serving.yaml \
    --gpus A100:8 \
@ -95,71 +93,72 @@ HF_TOKEN="your-huggingface-token" \
SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a `service` section to the YAML file.
??? Yaml

    ```yaml
    service:
      replicas: 2
      # An actual request for readiness probe.
      readiness_probe:
        path: /v1/chat/completions
        post_data:
          model: $MODEL_NAME
          messages:
            - role: user
              content: Hello! What is your name?
          max_completion_tokens: 1
    ```

??? Yaml

    ```yaml
    service:
      replicas: 2
      # An actual request for readiness probe.
      readiness_probe:
        path: /v1/chat/completions
        post_data:
          model: $MODEL_NAME
          messages:
            - role: user
              content: Hello! What is your name?
          max_completion_tokens: 1

    resources:
      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
      use_spot: True
      disk_size: 512  # Ensure model checkpoints can fit.
      disk_tier: best
      ports: 8081  # Expose to internet traffic.

    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      pip install vllm==0.4.0.post1
      # Install Gradio for web UI.
      pip install gradio openai
      pip install flash-attn==2.5.7

    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log
    ```
Start serving the Llama-3 8B model on multiple replicas:
```bash
HF_TOKEN="your-huggingface-token" \
    sky serve up -n vllm serving.yaml \
    --env HF_TOKEN
@ -167,11 +166,12 @@ HF_TOKEN="your-huggingface-token" \
Wait until the service is ready:

```bash
watch -n10 sky serve status vllm
```
Example outputs:

```console
Services
@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm  2   1  xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY  us-east4
```
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
??? Commands

    ```bash
    ENDPOINT=$(sky serve status --endpoint 8081 vllm)
    curl -L http://$ENDPOINT/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant."
                },
                {
                    "role": "user",
                    "content": "Who are you?"
                }
            ],
            "stop_token_ids": [128009, 128001]
        }'
    ```
To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
@ -220,64 +220,67 @@ service:
This will scale the service up when the QPS exceeds 2 for each replica.
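For reference, the `replica_policy` block being referred to looks like this (it also appears in the full recipe below):

```yaml
service:
  replica_policy:
    min_replicas: 2
    max_replicas: 4
    target_qps_per_replica: 2
```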
??? Yaml

    ```yaml
    service:
      replica_policy:
        min_replicas: 2
        max_replicas: 4
        target_qps_per_replica: 2
      # An actual request for readiness probe.
      readiness_probe:
        path: /v1/chat/completions
        post_data:
          model: $MODEL_NAME
          messages:
            - role: user
              content: Hello! What is your name?
          max_completion_tokens: 1

    resources:
      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
      use_spot: True
      disk_size: 512  # Ensure model checkpoints can fit.
      disk_tier: best
      ports: 8081  # Expose to internet traffic.

    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      pip install vllm==0.4.0.post1
      # Install Gradio for web UI.
      pip install gradio openai
      pip install flash-attn==2.5.7

    run: |
      conda activate vllm
      echo 'Starting vllm api server...'
      python -u -m vllm.entrypoints.openai.api_server \
        --port 8081 \
        --model $MODEL_NAME \
        --trust-remote-code \
        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
        2>&1 | tee api_server.log
    ```
To update the service with the new config:
```bash
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
```
To stop the service:
```bash
sky serve down vllm
```
@ -285,39 +288,42 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas.
??? Yaml

    ```yaml
    envs:
      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
      ENDPOINT: x.x.x.x:3031  # Address of the API server running vllm.

    resources:
      cpus: 2

    setup: |
      conda create -n vllm python=3.10 -y
      conda activate vllm

      # Install Gradio for web UI.
      pip install gradio openai

    run: |
      conda activate vllm
      export PATH=$PATH:/sbin

      echo 'Starting gradio server...'
      git clone https://github.com/vllm-project/vllm.git || true
      python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
        -m $MODEL_NAME \
        --port 8811 \
        --model-url http://$ENDPOINT/v1 \
        --stop-token-ids 128009,128001 | tee ~/gradio.log
    ```
1. Start the chat web UI:
    ```bash
    sky launch \
        -c gui ./gui.yaml \
        --env ENDPOINT=$(sky serve status --endpoint vllm)
View File
@ -15,13 +15,13 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu
- Start the vLLM server with the supported chat completion model, e.g.

    ```bash
    vllm serve qwen/Qwen1.5-0.5B-Chat
    ```
- Install streamlit and openai:

    ```bash
    pip install streamlit openai
    ```
@ -29,7 +29,7 @@ pip install streamlit openai
- Start the streamlit web UI and start chatting:

    ```bash
    streamlit run streamlit_openai_chatbot_webserver.py

    # or specify the VLLM_API_BASE or VLLM_API_KEY
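    # e.g. (assuming the vLLM server from the previous step is on localhost:8000):
    VLLM_API_BASE="http://localhost:8000/v1" streamlit run streamlit_openai_chatbot_webserver.py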
View File
@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
To install Llama Stack, run

```bash
pip install llama-stack -q
```
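Llama Stack can then point its inference provider at a running vLLM server. A minimal sketch of the relevant `run.yaml` fragment (assuming the `remote::vllm` provider and a vLLM server on localhost:8000; check the Llama Stack docs for the full file):

```yaml
inference:
  - provider_id: vllm
    provider_type: remote::vllm
    config:
      url: http://localhost:8000/v1
```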
View File
@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
curl -o- http://localhost:30080/models
```
??? Output

    ```json
    {
      "object": "list",
      "data": [
        {
          "id": "facebook/opt-125m",
          "object": "model",
          "created": 1737428424,
          "owned_by": "vllm",
          "root": null
        }
      ]
    }
    ```
To send an actual chat request, you can issue a curl request to the OpenAI `/completion` endpoint:
@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \
}'
```
??? Output

    ```json
    {
      "id": "completion-id",
      "object": "text_completion",
      "created": 1737428424,
      "model": "facebook/opt-125m",
      "choices": [
        {
          "text": " there was a brave knight who...",
          "index": 0,
          "finish_reason": "length"
        }
      ]
    }
    ```
### Uninstall
@ -121,25 +121,23 @@ sudo helm uninstall vllm
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
??? Yaml

    ```yaml
    servingEngineSpec:
      runtimeClassName: ""
      modelSpec:
        - name: "opt125m"
          repository: "vllm/vllm-openai"
          tag: "latest"
          modelURL: "facebook/opt-125m"

          replicaCount: 1

          requestCPU: 6
          requestMemory: "16Gi"
          requestGPU: 1

          pvcStorage: "10Gi"
    ```
In this YAML configuration: In this YAML configuration:
* **`modelSpec`** includes: * **`modelSpec`** includes:
View File
@ -29,93 +29,89 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:

```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-models
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
  # Secret data must be base64-encoded; encode the token from the HF_TOKEN env var.
  token: $(echo -n "$HF_TOKEN" | base64)
EOF
```
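You can confirm that both objects were created before moving on:

```console
kubectl get pvc vllm-models
kubectl get secret hf-token-secret
```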
Next, start the vLLM server as a Kubernetes Deployment and Service:

```bash
cat <<EOF |kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
    spec:
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args: [
          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
        ]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        ports:
        - containerPort: 8000
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
spec:
  selector:
    app.kubernetes.io/name: vllm
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
  type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):

```console
kubectl logs -l app.kubernetes.io/name=vllm
...
INFO:     Started server process [1]
```
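Once the server is up, a quick way to reach the ClusterIP Service from your workstation is a port-forward (assuming the default namespace):

```console
kubectl port-forward service/vllm-server 8000:8000 &
curl http://localhost:8000/v1/models
```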
@ -132,9 +128,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
PVC is used to store the model cache, and it is optional: you can use hostPath or other storage options instead.

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
```

@ -151,8 +144,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
  volumeMode: Filesystem
```

Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.

@ -165,16 +156,13 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
stringData:
  token: "REPLACE_WITH_TOKEN"
```

Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.

Here are two examples, one using an NVIDIA GPU and one using an AMD GPU.

NVIDIA GPU:

```yaml
apiVersion: apps/v1
kind: Deployment
```

@ -245,15 +233,10 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
          periodSeconds: 5
```

AMD GPU:

You can refer to the `deployment.yaml` below if using an AMD ROCm GPU like the MI300X.

```yaml
apiVersion: apps/v1
kind: Deployment
```

@ -322,17 +305,12 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
            mountPath: /dev/shm
```

You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.

2. Create a Kubernetes Service for vLLM

Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:

```yaml
apiVersion: v1
kind: Service
```

@ -352,20 +330,18 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

```yaml
  type: ClusterIP
```

3. Deploy and Test

Apply the deployment and service configurations using `kubectl apply -f <filename>`:

```console
kubectl apply -f deployment.yaml
kubectl apply -f service.yaml
```

To test the deployment, run the following `curl` command:

```console
curl http://mistral-7b.default.svc.cluster.local/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
```
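For reference, a complete request might look like the following; the prompt and sampling parameters are illustrative:

```console
curl http://mistral-7b.default.svc.cluster.local/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "mistralai/Mistral-7B-Instruct-v0.3",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
    }'
```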
View File
@ -11,13 +11,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.

```console
export vllm_root=`pwd`
```

Create a file named `Dockerfile.nginx`:

```console
FROM nginx:latest
RUN rm /etc/nginx/conf.d/default.conf
EXPOSE 80
```

@ -26,7 +26,7 @@ CMD ["nginx", "-g", "daemon off;"]

Build the container:

```console
docker build . -f Dockerfile.nginx --tag nginx-lb
```
@ -36,38 +36,36 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.

```console
upstream backend {
    least_conn;
    server vllm0:8000 max_fails=3 fail_timeout=10000s;
    server vllm1:8000 max_fails=3 fail_timeout=10000s;
}
server {
    listen 80;
    location / {
        proxy_pass http://backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }
}
```

[](){ #nginxloadbalancer-nginx-vllm-container }

## Build vLLM Container

```console
cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm
```

If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below:

```console
cd $vllm_root
docker build \
    -f docker/Dockerfile . \
```
@ -80,7 +78,7 @@ docker build \
## Create Docker Network

```console
docker network create vllm_nginx
```
@ -95,32 +93,30 @@ Notes:
- The below example assumes a GPU backend is used. If you are using a CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.

```console
mkdir -p ~/.cache/huggingface/hub/
hf_cache_dir=~/.cache/huggingface/
docker run \
    -itd \
    --ipc host \
    --network vllm_nginx \
    --gpus device=0 \
    --shm-size=10.24gb \
    -v $hf_cache_dir:/root/.cache/huggingface/ \
    -p 8081:8000 \
    --name vllm0 vllm \
    --model meta-llama/Llama-2-7b-chat-hf
docker run \
    -itd \
    --ipc host \
    --network vllm_nginx \
    --gpus device=1 \
    --shm-size=10.24gb \
    -v $hf_cache_dir:/root/.cache/huggingface/ \
    -p 8082:8000 \
    --name vllm1 vllm \
    --model meta-llama/Llama-2-7b-chat-hf
```

!!! note
    If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
@ -129,7 +125,7 @@ Notes:
## Launch Nginx

```console
docker run \
    -itd \
    -p 8000:80 \
```
@ -142,7 +138,7 @@ docker run \
## Verify That vLLM Servers Are Ready

```console
docker logs vllm0 | grep Uvicorn
docker logs vllm1 | grep Uvicorn
```
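Once both servers report that Uvicorn is running, you can send a request through the load balancer (published on port 8000 above) to confirm end-to-end routing; the prompt is illustrative:

```console
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-2-7b-chat-hf", "prompt": "Hello,", "max_tokens": 8}'
```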
View File
@ -22,33 +22,31 @@ server.
Here is a sample of `LLM` class usage:

```python
from vllm import LLM, SamplingParams

# Define a list of input prompts
prompts = [
    "Hello, my name is",
    "The capital of France is",
    "The largest ocean is",
]

# Define sampling parameters
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Initialize the LLM engine with the OPT-125M model
llm = LLM(model="facebook/opt-125m")

# Generate outputs for the input prompts
outputs = llm.generate(prompts, sampling_params)

# Print the generated outputs
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
@ -180,34 +178,32 @@ vision-language model.
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:

```python
class MyOldModel(nn.Module):
    def __init__(
        self,
        config,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
        prefix: str = "",
    ) -> None:
        ...

from vllm.config import VllmConfig
class MyNewModel(MyOldModel):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config
        super().__init__(config, cache_config, quant_config, lora_config, prefix)

if __version__ >= "0.6.4":
    MyModel = MyNewModel
else:
    MyModel = MyOldModel
```

This way, the model can work with both old and new versions of vLLM.
View File
@ -448,29 +448,27 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in
different thread register memory.

```cpp
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) {
    // Upper warps write to shared memory.
    ...
    float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
        ...
        dst[row_idx] = accs[i];
    }

    // Lower warps update the output.
    const float* src = &out_smem[warp_idx * HEAD_SIZE];
    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
        ...
        accs[i] += src[row_idx];
    }

    // Write out the accs.
}
```

## Output
View File
@ -13,30 +13,28 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:

```python
# inside `setup.py` file
from setuptools import setup

setup(name='vllm_add_dummy_model',
      version='0.1',
      packages=['vllm_add_dummy_model'],
      entry_points={
          'vllm.general_plugins':
          ["register_dummy_model = vllm_add_dummy_model:register"]
      })

# inside `vllm_add_dummy_model.py` file
def register():
    from vllm import ModelRegistry

    if "MyLlava" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "MyLlava",
            "vllm_add_dummy_model.my_llava:MyLlava",
        )
```

For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
View File
@ -61,25 +61,23 @@ To address the above issues, I have designed and developed a local Tensor memory
# Install vLLM

```shell
# Enter the home directory or your working directory.
cd /home

# Download the installation package, and I will update the commit-id in time. You can directly copy the command.
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl

# Download the code repository.
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
cd vllm

# Set the installation package path.
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl

# installation
pip install -e . -v
```

# Run xPyD
@ -106,91 +104,83 @@ python3 disagg_prefill_proxy_xpyd.py &
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20005 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20009 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20003 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20008 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

## Run 3P1D
@ -203,91 +193,83 @@ python3 disagg_prefill_proxy_xpyd.py &
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20005 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20009 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20003 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20008 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

# Single request
@ -304,27 +286,25 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
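A single completion request through the proxy might look like the following sketch; the prompt and token counts are illustrative, and `base_model` matches `--served-model-name` above:

```shell
curl -X POST -s http://10.0.1.1:10001/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "base_model",
        "prompt": "San Francisco is a",
        "max_tokens": 16,
        "temperature": 0
    }'
```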
# Benchmark

```shell
python3 benchmark_serving.py \
    --backend vllm \
    --model base_model \
    --tokenizer meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name "random" \
    --host 10.0.1.1 \
    --port 10001 \
    --random-input-len 1024 \
    --random-output-len 1024 \
    --ignore-eos \
    --burstiness 100 \
    --percentile-metrics "ttft,tpot,itl,e2el" \
    --metric-percentiles "90,95,99" \
    --seed $(date +%s) \
    --trust-remote-code \
    --request-rate 3 \
    --num-prompts 1000
```

# Shut down
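One blunt way to stop everything, assuming nothing else on the machine matches these patterns, is to kill the serve and proxy processes by name:

```shell
pkill -f vllm
pkill -f disagg_prefill_proxy_xpyd
```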
View File
@ -28,29 +28,27 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
In the very verbose logs, we can see:

```
DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>

DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py

DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
```

This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function at `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, other functions are also called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access triggers a function call) and some communication / attention / activation functions from vLLM. All the traced files are considered when we decide the cache directory to use, so any code change in the above files will trigger a compilation cache miss, and therefore recompilation.
@ -101,31 +99,28 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:

```
vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
```

Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
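The same shapes can be requested through the offline API as well; a sketch, assuming a vLLM version whose `LLM` constructor accepts a `compilation_config` dict mirroring the CLI flag:

```python
from vllm import LLM

# Compile dedicated kernels for batch sizes 1, 2, 4 and 8 in addition to
# the general symbolic-shape compilation.
llm = LLM(
    model="meta-llama/Llama-3.2-1B",
    compilation_config={"compile_sizes": [1, 2, 4, 8]},
)
```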
When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log:

```
AUTOTUNE mm(8x2048, 2048x3072)
  triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
  mm 0.0160 ms 81.6%
  triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
  triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
```

It means that, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries Triton templates with various configs, and the best one is much faster than the default code (which dispatches to the cuBLAS library).
@ -141,9 +136,8 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:

```
vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
```

Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
View File
@ -29,26 +29,24 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter.

```python
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    stop=["[/assistant]"]
)

prompts = [
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
]

outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
```

Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
@ -70,26 +68,24 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):

```bash
curl localhost:8000/v1/models | jq .
{
    "object": "list",
    "data": [
        {
            "id": "meta-llama/Llama-2-7b-hf",
            "object": "model",
            ...
        },
        {
            "id": "sql-lora",
            "object": "model",
            ...
        }
    ]
}
```

Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
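For example, a completion request can select the adapter by name; the prompt is illustrative:

```bash
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "sql-lora",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
    }'
```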
@ -172,36 +168,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface.

Example of a simple S3 LoRAResolver implementation:

```python
import os
import s3fs
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver

class S3LoRAResolver(LoRAResolver):
    def __init__(self):
        self.s3 = s3fs.S3FileSystem()
        self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
        self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")

    async def resolve_lora(self, base_model_name, lora_name):
        s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
        local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)

        # Download the LoRA from S3 to the local path
        await self.s3._get(
            s3_path, local_path, recursive=True, maxdepth=1
        )

        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_path=local_path,
            lora_int_id=abs(hash(lora_name))
        )

        return lora_request
```

2. Register `LoRAResolver` plugin.
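A minimal registration sketch is shown below; it assumes the `LoRAResolverRegistry` helper in `vllm.lora.resolver` (check the exact name against your vLLM version):

```python
from vllm.lora.resolver import LoRAResolverRegistry

# Make the resolver available under a human-readable name.
LoRAResolverRegistry.register_resolver("s3_resolver", S3LoRAResolver())
```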
@ -238,40 +234,38 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the lora adapter.

```bash
$ curl http://localhost:8000/v1/models

{
    "object": "list",
    "data": [
        {
            "id": "meta-llama/Llama-2-7b-hf",
            "object": "model",
            "created": 1715644056,
            "owned_by": "vllm",
            "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
            "parent": null,
            "permission": [
                {
                    .....
                }
            ]
        },
        {
            "id": "sql-lora",
            "object": "model",
            "created": 1715644056,
            "owned_by": "vllm",
            "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
            "parent": "meta-llama/Llama-2-7b-hf",
            "permission": [
                {
                    ....
                }
            ]
        }
    ]
}
```
View File
@ -20,117 +20,111 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:

```python
from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

# Load the image using PIL.Image
image = PIL.Image.open(...)

# Single prompt inference
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)

# Batch inference
image_1 = PIL.Image.open(...)
image_2 = PIL.Image.open(...)
outputs = llm.generate(
    [
        {
            "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
            "multi_modal_data": {"image": image_1},
        },
        {
            "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
            "multi_modal_data": {"image": image_2},
        }
    ]
)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Full example: <gh-file:examples/offline_inference/vision_language.py>

To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
```python
from vllm import LLM

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,  # Required to load Phi-3.5-vision
    max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
    limit_mm_per_prompt={"image": 2},  # The maximum number to accept
)

# Refer to the HuggingFace repo for the correct format to use
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"

# Load the images using PIL.Image
image1 = PIL.Image.open(...)
image2 = PIL.Image.open(...)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": [image1, image2]
    },
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
```python
from vllm import LLM

# Specify the maximum number of frames per video to be 4. This can be changed.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

# Create the request payload.
video_frames = ... # load your video making sure it only has the number of frames specified earlier.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
    ],
}
for i in range(len(video_frames)):
    base64_image = encode_image(video_frames[i]) # base64 encoding.
    new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
    message["content"].append(new_image)

# Perform inference and log output.
outputs = llm.chat([message])

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```
### Video Inputs
@ -150,72 +144,68 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
```python
import torch

from vllm import LLM

# Inference with image embeddings as input
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

# Embeddings for single image
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image_embeds},
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
```python
# Construct the prompt based on your model
prompt = ...

# Embeddings for multiple images
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)

# Qwen2-VL
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
mm_data = {
    "image": {
        "image_embeds": image_embeds,
        # image_grid_thw is needed to calculate positional encoding.
        "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
    }
}

# MiniCPM-V
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
mm_data = {
    "image": {
        "image_embeds": image_embeds,
        # image_sizes is needed to calculate details of the sliced image.
        "image_sizes": [image.size for image in images],  # list of image sizes
    }
}

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": mm_data,
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```
## Online Serving
@ -245,53 +235,51 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows:
```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            # NOTE: The prompt formatting with the image token `<image>` is not needed
            # since the prompt will be processed automatically by the API server.
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }],
)
print("Chat completion output:", chat_response.choices[0].message.content)

# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What are the animals in these images?"},
            {"type": "image_url", "image_url": {"url": image_url_duck}},
            {"type": "image_url", "image_url": {"url": image_url_lion}},
        ],
    }],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -307,7 +295,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching images through HTTP URL is `5` seconds.
You can override this by setting the environment variable:
```console
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
```
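For example, to allow up to 30 seconds per image fetch before a request fails (the timeout value and serve command below are only an illustration; substitute your own model):

```console
VLLM_IMAGE_FETCH_TIMEOUT=30 vllm serve microsoft/Phi-3.5-vision-instruct --task generate
```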
@ -323,46 +311,44 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows:
```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -370,7 +356,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:
```console
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
@ -387,88 +373,84 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows:
```python
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset

def encode_base64_content_from_url(content_url: str) -> str:
    """Encode content retrieved from a remote URL to base64 format."""

    with requests.get(content_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)

chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "input_audio",
                "input_audio": {
                    "data": audio_base64,
                    "format": "wav"
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result)
```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
```python
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -476,7 +458,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
You can override this by setting the environment variable:
```console
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
```
@ -488,63 +470,61 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server:
```python
import base64
import io

import torch
from openai import OpenAI

image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct

buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
    "type": "image_embeds",
    "image_embeds": f"{base64_image_embedding}"
}

# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
    },
}
model = "openbmb/MiniCPM-V-2_6"
embeds = {
    "type": "image_embeds",
    "image_embeds": {
        "image_embeds": f"{base64_image_embedding}",  # Required
        "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
    },
}
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "What's in this image?",
            },
            embeds,
        ],
        },
    ],
    model=model,
)
```
!!! note
    Only one message can contain `{"type": "image_embeds"}`.
View File
@ -9,41 +9,39 @@ The main benefits are lower latency and memory usage.
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).
```console
pip install autoawq
```
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')
```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```console
python examples/offline_inference/llm_engine_example.py \
    --model TheBloke/Llama-2-7b-Chat-AWQ \
    --quantization awq
```
@ -51,29 +49,27 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint:
```python
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
View File
@ -12,7 +12,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
Below are the steps to utilize BitBLAS with vLLM.
```console
pip install bitblas>=0.1.0
```
@ -43,19 +43,17 @@ llm = LLM(
## Read gptq format checkpoint
```python
from vllm import LLM
import torch

# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM(
    model=model_id,
    dtype=torch.float16,
    trust_remote_code=True,
    quantization="bitblas",
    max_model_len=1024
)
```
View File
@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
Below are the steps to utilize BitsAndBytes with vLLM.
```console
pip install bitsandbytes>=0.45.3
```
@ -54,6 +54,6 @@ llm = LLM(
Append the following to your model arguments for 4bit inflight quantization:
```console
--quantization bitsandbytes
```
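For offline inference, the same option maps to the `quantization` argument of the `LLM` constructor. A minimal sketch (the model ID below is only an illustration):

```python
from vllm import LLM

# In-flight 4-bit BitsAndBytes quantization; weights are quantized at load time.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",  # hypothetical model choice
    quantization="bitsandbytes",
)
```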
View File
@ -23,7 +23,7 @@ The FP8 types typically supported in hardware have two distinct representations,
To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```console
pip install llmcompressor
```
@ -58,30 +58,28 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Configure the simple PTQ quantization
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)

# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
### 3. Evaluating Accuracy

Install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
@ -99,9 +97,9 @@ Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
!!! note
    Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
```console
$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
$ lm_eval \
    --model vllm \
    --model_args pretrained=$MODEL,add_bos_token=True \
    --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250
```
View File
@ -11,7 +11,7 @@ title: GGUF
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
```console
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
@ -20,7 +20,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
```console
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -32,7 +32,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model, you can manually create a config and pass it as `--hf-config-path`:
```console
# If your model is not supported by huggingface you can manually provide a huggingface compatible config path
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -41,44 +41,42 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint:
```python
from vllm import LLM, SamplingParams

# In this script, we demonstrate how to pass input to the chat method:
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
          tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
View File
@ -21,7 +21,7 @@ for more details on this and other advanced features.
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
```console
pip install -U gptqmodel --no-build-isolation -v
```
@ -31,36 +31,34 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
```python
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig

model_id = "meta-llama/Llama-3.2-1B-Instruct"
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

calibration_dataset = load_dataset(
    "allenai/c4",
    data_files="en/c4-train.00001-of-01024.json.gz",
    split="train"
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)

model = GPTQModel.load(model_id, quant_config)

# increase `batch_size` to match gpu/vram specs to speed up quantization
model.quantize(calibration_dataset, batch_size=2)

model.save(quant_path)
```
## Running a quantized model with vLLM

To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```console
python examples/offline_inference/llm_engine_example.py \
    --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```
@ -69,34 +67,32 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint:
```python
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

# Create an LLM.
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
print("-"*50)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-"*50)
```
View File
@ -14,13 +14,13 @@ Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs re
To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```console
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
@ -53,55 +53,51 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load and preprocess the dataset
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(preprocess)

def tokenize(sample):
    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization

Now, apply the quantization algorithms:
```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Configure the quantization algorithms
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W4A16 model with weights quantized to 4-bit integers.
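The saved checkpoint loads directly in vLLM; a minimal sketch mirroring the save path used above:

```python
from vllm import LLM

# Load the compressed W4A16 checkpoint produced by the previous step.
model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
```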
@ -116,8 +112,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
To evaluate accuracy, you can use `lm_eval`:
```console
$ lm_eval --model vllm \
    --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
    --tasks gsm8k \
    --num_fewshot 5 \
@ -141,36 +137,34 @@ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case:
```python
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    QuantizationStrategy,
    QuantizationType,
)
recipe = GPTQModifier(
    targets="Linear",
    config_groups={
        "config_group": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(
                num_bits=4,
                type=QuantizationType.INT,
                strategy=QuantizationStrategy.GROUP,
                group_size=128,
                symmetric=True,
                dynamic=False,
                actorder="weight",
            ),
        ),
    },
    ignore=["lm_head"],
    update_size=NUM_CALIBRATION_SAMPLES,
    dampening_frac=0.01
)
```
## Troubleshooting and Support
View File
@ -15,13 +15,13 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re
To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```console
pip install llmcompressor
```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console
pip install vllm lm-eval==0.4.4
```
@ -54,60 +54,54 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load and preprocess the dataset
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(preprocess)

def tokenize(sample):
    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization

Now, apply the quantization algorithms:
```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Configure the quantization algorithms
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
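As with the INT4 flow, the saved checkpoint loads directly in vLLM; a minimal sketch mirroring the save path used above:

```python
from vllm import LLM

# Load the compressed W8A8 checkpoint produced by the previous step.
model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
```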
@ -122,8 +116,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
To evaluate accuracy, you can use `lm_eval`:
```console
$ lm_eval --model vllm \
    --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \
    --tasks gsm8k \
    --num_fewshot 5 \
View File
@ -4,7 +4,7 @@ The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-O
We recommend installing the library with:
```console
pip install nvidia-modelopt
```
@ -14,26 +14,24 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API:
```python
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM

# Load the model from HuggingFace
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")

# Select the quantization config, for example, FP8
config = mtq.FP8_DEFAULT_CFG

# Define a forward loop function for calibration
def forward_loop(model):
    for data in calib_set:
        model(data)

# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop)
```
After the model is quantized, you can export it to a quantized checkpoint using the export API:
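A minimal sketch of that export step, assuming modelopt's `export_hf_checkpoint` helper (check the Model Optimizer docs for the exact signature and output layout; the directory name is a placeholder):

```python
import torch
from modelopt.torch.export import export_hf_checkpoint

# Export the quantized model as a HuggingFace-style checkpoint directory.
with torch.inference_mode():
    export_hf_checkpoint(model, export_dir="<quantized_checkpoint_dir>")
```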
@ -50,33 +48,31 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
```python
from vllm import LLM, SamplingParams

def main():
    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

if __name__ == "__main__":
    main()
```
View File
@ -35,22 +35,20 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
Here is an example of how to enable FP8 quantization:
```python
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter

from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
          kv_cache_dtype="fp8",
          calculate_kv_scales=True)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
```
The `kv_cache_dtype` argument specifies the data type for KV cache storage:

- `"auto"`: Uses the model's default "unquantized" data type
@ -65,7 +63,7 @@ For optimal model quality when using FP8 KV Cache, we recommend using calibrated
First, install the required dependencies:
```console
pip install llmcompressor
```
@ -73,69 +71,67 @@ pip install llmcompressor
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.transformers import oneshot

# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Configure calibration parameters
NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
MAX_SEQUENCE_LENGTH = 2048

# Load and preprocess dataset
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def process_and_tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

# Configure quantization settings
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
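
That checkpoint can then be consumed directly by vLLM. Below is a minimal sketch, assuming the folder produced above sits in the current working directory; passing `kv_cache_dtype="fp8"` is what enables the FP8 KV cache together with the calibrated scales stored in the checkpoint:

```python
from vllm import LLM, SamplingParams

# Load the locally saved checkpoint produced by the script above.
# kv_cache_dtype="fp8" enables the FP8 KV cache with the calibrated scales.
llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")

sampling_params = SamplingParams(temperature=0.7, max_tokens=64)
outputs = llm.generate("The capital of France is", sampling_params)
print(outputs[0].outputs[0].text)
```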
@@ -13,7 +13,7 @@ AWQ, GPTQ, Rotation and SmoothQuant.

Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:

```bash
pip install amd-quark
```
@@ -22,13 +22,13 @@ for more installation details.

Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

```bash
pip install vllm lm-eval==0.4.4
```
## Quantization Process

After installing Quark, we will use an example to illustrate how to use Quark.
The Quark quantization process consists of the following five steps:

1. Load the model

@@ -42,22 +42,20 @@ The Quark quantization process consists of the following five steps:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch model and tokenizer.

??? Code

    ```python
    from transformers import AutoTokenizer, AutoModelForCausalLM

    MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
    MAX_SEQ_LEN = 512

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", torch_dtype="auto",
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
    tokenizer.pad_token = tokenizer.eos_token
    ```
### 2. Prepare the Calibration Dataloader

@@ -65,24 +63,22 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic

to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
??? Code

    ```python
    from datasets import load_dataset
    from torch.utils.data import DataLoader

    BATCH_SIZE = 1
    NUM_CALIBRATION_DATA = 512

    # Load the dataset and get calibration data.
    dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
    text_data = dataset["text"][:NUM_CALIBRATION_DATA]

    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
                                  padding=True, truncation=True, max_length=MAX_SEQ_LEN)
    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
                                  batch_size=BATCH_SIZE, drop_last=True)
    ```
### 3. Set the Quantization Configuration

@@ -98,44 +94,42 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.

AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
??? Code

    ```python
    from quark.torch.quantization import (Config, QuantizationConfig,
                                          FP8E4M3PerTensorSpec,
                                          load_quant_algo_config_from_file)

    # Define fp8/per-tensor/static spec.
    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
                                               is_dynamic=False).to_quantization_spec()

    # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
                                             weight=FP8_PER_TENSOR_SPEC)

    # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
    KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
    kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
    kv_cache_quant_config = {name:
        QuantizationConfig(input_tensors=global_quant_config.input_tensors,
                           weight=global_quant_config.weight,
                           output_tensors=KV_CACHE_SPEC)
        for name in kv_cache_layer_names_for_llama}
    layer_quant_config = kv_cache_quant_config.copy()

    # Define algorithm config by config file.
    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
    algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

    EXCLUDE_LAYERS = ["lm_head"]
    quant_config = Config(
        global_quant_config=global_quant_config,
        layer_quant_config=layer_quant_config,
        kv_cache_quant_config=kv_cache_quant_config,
        exclude=EXCLUDE_LAYERS,
        algo_config=algo_config)
    ```
### 4. Quantize the Model and Export

@@ -145,72 +139,68 @@ HuggingFace `safetensors`, you can refer to

[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details.
??? Code

    ```python
    import torch
    from quark.torch import ModelQuantizer, ModelExporter
    from quark.torch.export import ExporterConfig, JsonExporterConfig

    # Apply quantization.
    quantizer = ModelQuantizer(quant_config)
    quant_model = quantizer.quantize_model(model, calib_dataloader)

    # Freeze quantized model to export.
    freezed_model = quantizer.freeze(model)

    # Define export config.
    LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
    export_config = ExporterConfig(json_export_config=JsonExporterConfig())
    export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP

    # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
    EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
    exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
    with torch.no_grad():
        exporter.export_safetensors_model(freezed_model,
                                          quant_config=quant_config, tokenizer=tokenizer)
    ```
### 5. Evaluation in vLLM

Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
              kv_cache_dtype='fp8', quantization='quark')
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)
    ```
Or, you can use `lm_eval` to evaluate accuracy:

```bash
lm_eval --model vllm \
    --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
    --tasks gsm8k
```
@@ -222,7 +212,7 @@ to quantize large language models more conveniently. It supports quantizing mode

of different quantization schemes and optimization algorithms. It can export the quantized model
and run evaluation tasks on the fly. With the script, the example above can be:

```bash
python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
    --output_dir /path/to/output \
    --quant_scheme w_fp8_a_fp8 \
```
@@ -4,7 +4,7 @@ TorchAO is an architecture optimization library for PyTorch, it provides high pe

We recommend installing the latest torchao nightly with

```bash
# Install the latest TorchAO nightly build
# Choose the CUDA version that matches your system (cu126, cu128, etc.)
pip install \
```

@@ -15,28 +15,26 @@ pip install \
## Quantizing HuggingFace Models

You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
??? Code

    ```python
    import torch
    from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
    from torchao.quantization import Int8WeightOnlyConfig

    model_name = "meta-llama/Meta-Llama-3-8B"
    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        quantization_config=quantization_config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_text = "What are we having for dinner?"
    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

    hub_repo = "YOUR HUB REPO ID"
    tokenizer.push_to_hub(hub_repo)
    quantized_model.push_to_hub(hub_repo, safe_serialization=False)
    ```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
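
A checkpoint pushed this way can then be loaded in vLLM. The sketch below assumes a hypothetical hub repo `your-username/llama3-8b-int8wo` (substitute whatever `hub_repo` you pushed to); the torchao quantization settings are read from the checkpoint's own config:

```python
from vllm import LLM

# "your-username/llama3-8b-int8wo" is a placeholder for the hub_repo above;
# the torchao quantization config is picked up from the checkpoint itself.
llm = LLM(model="your-username/llama3-8b-int8wo", dtype="bfloat16")

outputs = llm.generate("What are we having for dinner?")
print(outputs[0].outputs[0].text)
```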
@@ -33,36 +33,34 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \

Next, make a request to the model that should return the reasoning content in the response.
??? Code

    ```python
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Round 1
    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
    response = client.chat.completions.create(model=model, messages=messages)

    reasoning_content = response.choices[0].message.reasoning_content
    content = response.choices[0].message.content

    print("reasoning_content:", reasoning_content)
    print("content:", content)
    ```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.

@@ -70,81 +68,77 @@ The `reasoning_content` field contains the reasoning steps that led to the final

Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
??? Json

    ```json
    {
        "id": "chatcmpl-123",
        "object": "chat.completion.chunk",
        "created": 1694268190,
        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [
            {
                "index": 0,
                "delta": {
                    "role": "assistant",
                    "reasoning_content": "is"
                },
                "logprobs": null,
                "finish_reason": null
            }
        ]
    }
    ```
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present. For example:
??? Code

    ```python
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
    stream = client.chat.completions.create(model=model,
                                            messages=messages,
                                            stream=True)

    print("client: Start streaming chat completions...")
    printed_reasoning_content = False
    printed_content = False

    for chunk in stream:
        reasoning_content = None
        content = None
        # Check the content is reasoning_content or content
        if hasattr(chunk.choices[0].delta, "reasoning_content"):
            reasoning_content = chunk.choices[0].delta.reasoning_content
        elif hasattr(chunk.choices[0].delta, "content"):
            content = chunk.choices[0].delta.content

        if reasoning_content is not None:
            if not printed_reasoning_content:
                printed_reasoning_content = True
                print("reasoning_content:", end="", flush=True)
            print(reasoning_content, end="", flush=True)
        elif content is not None:
            if not printed_content:
                printed_content = True
                print("\ncontent:", end="", flush=True)
            # Extract and print the content
            print(content, end="", flush=True)
    ```
Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
@@ -152,43 +146,41 @@ Remember to check whether the `reasoning_content` exists in the response before

The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
??? Code

    ```python
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "unit"]
            }
        }
    }]

    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
        tools=tools,
        tool_choice="auto"
    )

    print(response)
    tool_call = response.choices[0].message.tool_calls[0].function
    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
    print(f"Function called: {tool_call.name}")
    print(f"Arguments: {tool_call.arguments}")
    ```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.

@@ -200,89 +192,85 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_

You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
??? Code

    ```python
    # import the required packages
    from collections.abc import Sequence
    from typing import Optional, Union

    from vllm.reasoning import ReasoningParser, ReasoningParserManager
    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                                  DeltaMessage)
    from vllm.transformers_utils.tokenizer import AnyTokenizer

    # define a reasoning parser and register it to vllm
    # the name list in register_module can be used
    # in --reasoning-parser.
    @ReasoningParserManager.register_module(["example"])
    class ExampleParser(ReasoningParser):
        def __init__(self, tokenizer: AnyTokenizer):
            super().__init__(tokenizer)

        def extract_reasoning_content_streaming(
            self,
            previous_text: str,
            current_text: str,
            delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int],
            delta_token_ids: Sequence[int],
        ) -> Union[DeltaMessage, None]:
            """
            Instance method that should be implemented for extracting reasoning
            from an incomplete response; for use when handling reasoning calls and
            streaming. Has to be an instance method because it requires state -
            the current tokens/diffs, but also the information about what has
            previously been parsed and extracted (see constructor)
            """

        def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
        ) -> tuple[Optional[str], Optional[str]]:
            """
            Extract reasoning content from a complete model-generated string.

            Used for non-streaming responses where we have the entire model response
            available before sending to the client.

            Parameters:
            model_output: str
                The model-generated string to extract reasoning content from.
            request: ChatCompletionRequest
                The request object that was used to generate the model_output.

            Returns:
            tuple[Optional[str], Optional[str]]
                A tuple containing the reasoning content and the content.
            """
    ```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
??? Code

    ```python
    @dataclass
    class DeepSeekReasoner(Reasoner):
        """
        Reasoner for DeepSeek R series models.
        """
        start_token_id: int
        end_token_id: int

        start_token: str = "<think>"
        end_token: str = "</think>"

        @classmethod
        def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
            return cls(start_token_id=tokenizer.encode(
                "<think>", add_special_tokens=False)[0],
                       end_token_id=tokenizer.encode("</think>",
                                                     add_special_tokens=False)[0])

        def is_reasoning_end(self, input_ids: list[int]) -> bool:
            return self.end_token_id in input_ids
        ...
    ```
A structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) uses `end_token_id` to check whether the model is still emitting reasoning content, and skips structured output enforcement while that is the case.
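
For intuition, the gating amounts to something like the sketch below. This is illustrative only: `reasoner`, `matcher`, and `apply_token_bitmask` are stand-ins for whatever the engine actually uses, not the real xgrammar API.

```python
def constrain_logits(reasoner, matcher, input_ids, logits):
    # While no end-of-reasoning token has been generated yet,
    # leave the logits unconstrained so the model can think freely.
    if not reasoner.is_reasoning_end(input_ids):
        return logits
    # Reasoning is finished: enforce the grammar on everything that follows.
    return matcher.apply_token_bitmask(logits)
```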
@@ -18,31 +18,29 @@ Speculative decoding is a technique which improves inter-token latency in memory

The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="facebook/opt-6.7b",
        tensor_parallel_size=1,
        speculative_config={
            "model": "facebook/opt-125m",
            "num_speculative_tokens": 5,
        },
    )
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
To perform the same with an online mode, launch the server:

@@ -62,73 +60,69 @@ python -m vllm.entrypoints.openai.api_server \

Then use a client:
??? Code

    ```python
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Completion API
    stream = False
    completion = client.completions.create(
        model=model,
        prompt="The future of AI is",
        echo=False,
        n=1,
        stream=stream,
    )

    print("Completion results:")
    if stream:
        for c in completion:
            print(c)
    else:
        print(completion)
    ```
## Speculating by matching n-grams in the prompt

The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information read [this thread](https://x.com/joao_gante/status/1747322413006643259).
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="facebook/opt-6.7b",
        tensor_parallel_size=1,
        speculative_config={
            "method": "ngram",
            "num_speculative_tokens": 5,
            "prompt_lookup_max": 4,
        },
    )
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

Here `prompt_lookup_max` is the maximum n-gram length that is matched against the prompt when forming draft proposals.
## Speculating using MLP speculators

@@ -137,31 +131,29 @@ draft models that condition draft predictions on both context vectors and sam

For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124).
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        tensor_parallel_size=4,
        speculative_config={
            "model": "ibm-ai-platform/llama3-70b-accelerator",
            "draft_tensor_parallel_size": 1,
        },
    )
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the

@@ -185,33 +177,31 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
??? Code

    ```python
    from vllm import LLM, SamplingParams

    prompts = [
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        tensor_parallel_size=4,
        speculative_config={
            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
            "draft_tensor_parallel_size": 1,
        },
    )

    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```
A few important things to consider when using the EAGLE based draft models:
@@ -33,43 +33,39 @@ text.

Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
??? Code

    ```python
    from openai import OpenAI

    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="-",
    )
    model = client.models.list().data[0].id

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
        ],
        extra_body={"guided_choice": ["positive", "negative"]},
    )
    print(completion.choices[0].message.content)
    ```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
??? Code

    ```python
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
            }
        ],
        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
    )
    print(completion.choices[0].message.content)
    ```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways:

@@ -79,43 +75,41 @@ For this we can use the `guided_json` parameter in two different ways:

The next example shows how to use the `guided_json` parameter with a Pydantic model:
??? Code

    ```python
    from pydantic import BaseModel
    from enum import Enum

    class CarType(str, Enum):
        sedan = "sedan"
        suv = "SUV"
        truck = "Truck"
        coupe = "Coupe"

    class CarDescription(BaseModel):
        brand: str
        model: str
        car_type: CarType

    json_schema = CarDescription.model_json_schema()

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                "schema": json_schema
            },
        },
    )
    print(completion.choices[0].message.content)
    ```
!!! tip
    While not strictly necessary, normally it's better to indicate in the prompt the

@@ -127,35 +121,33 @@ difficult to use, but it's really powerful. It allows us to define complete

languages like SQL queries. It works by using a context-free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:
??? Code

    ```python
    simplified_sql_grammar = """
        root ::= select_statement

        select_statement ::= "SELECT " column " from " table " where " condition

        column ::= "col_1 " | "col_2 "

        table ::= "table_1 " | "table_2 "

        condition ::= column "= " number

        number ::= "1 " | "2 "
    """

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    print(completion.choices[0].message.content)
    ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

@@ -169,36 +161,34 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r

Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
??? Code

    ```python
    from pydantic import BaseModel

    class People(BaseModel):
        name: str
        age: int

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "Generate a JSON with the name and age of one random person.",
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "people",
                "schema": People.model_json_schema()
            }
        },
    )
    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
    print("content: ", completion.choices[0].message.content)
    ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

@@ -212,33 +202,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.

Here is a simple example demonstrating how to get structured output using Pydantic models:
??? Code

    ```python
    from pydantic import BaseModel
    from openai import OpenAI

    class Info(BaseModel):
        name: str
        age: int

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
    model = client.models.list().data[0].id
    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
        ],
        response_format=Info,
    )

    message = completion.choices[0].message
    print(message)
    assert message.parsed
    print("Name:", message.parsed.name)
    print("Age:", message.parsed.age)
    ```

Output:

```console
ParsedChatCompletionMessage[Info](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Info(name='Cameron', age=28))
```
@@ -248,37 +238,35 @@ Age: 28

Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
??? Code

    ```python
    from pydantic import BaseModel
    from openai import OpenAI

    class Step(BaseModel):
        explanation: str
        output: str

    class MathResponse(BaseModel):
        steps: list[Step]
        final_answer: str

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful expert math tutor."},
            {"role": "user", "content": "Solve 8x + 31 = 2."},
        ],
        response_format=MathResponse,
    )

    message = completion.choices[0].message
    print(message)
    assert message.parsed
    for i, step in enumerate(message.parsed.steps):
        print(f"Step #{i}:", step)
    print("Answer:", message.parsed.final_answer)
    ```
Output:

@@ -308,21 +296,19 @@ These parameters can be used in the same way as the parameters from the Online

Serving examples above. One example for the usage of the `choice` parameter is
shown below:
??? Code

    ```python
    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

    guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
    sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
    outputs = llm.generate(
        prompts="Classify this sentiment: vLLM is wonderful!",
        sampling_params=sampling_params,
    )
    print(outputs[0].outputs[0].text)
    ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@@ -15,46 +15,44 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \

Next, make a request to the model that should result in it using the available tools:
??? Code

    ```python
    from openai import OpenAI
    import json

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

    def get_weather(location: str, unit: str):
        return f"Getting the weather for {location} in {unit}..."
    tool_functions = {"get_weather": get_weather}

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "unit"]
            }
        }
    }]

    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
        tools=tools,
        tool_choice="auto"
    )

    tool_call = response.choices[0].message.tool_calls[0].function
    print(f"Function called: {tool_call.name}")
    print(f"Arguments: {tool_call.arguments}")
    print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
    ```
Example output:

@@ -228,25 +226,6 @@ AI21's Jamba-1.5 models are supported.

Flags: `--tool-call-parser jamba`
### xLAM Models (`xlam`)

The xLAM tool parser is designed to support models that generate tool calls in various JSON formats. It detects function calls in several different output styles:

1. Direct JSON arrays: Output strings that are JSON arrays starting with `[` and ending with `]`
2. Thinking tags: Using `<think>...</think>` tags containing JSON arrays
3. Code blocks: JSON in code blocks (```json ... ```)
4. Tool calls tags: Using `[TOOL_CALLS]` or `<tool_call>...</tool_call>` tags

Parallel function calls are supported, and the parser can effectively separate text content from tool calls (illustrative examples of these styles are sketched below).

Supported models:

* Salesforce Llama-xLAM models: `Salesforce/Llama-xLAM-2-8B-fc-r`, `Salesforce/Llama-xLAM-2-70B-fc-r`
* Qwen-xLAM models: `Salesforce/xLAM-1B-fc-r`, `Salesforce/xLAM-3B-fc-r`, `Salesforce/Qwen-xLAM-32B-fc-r`

Flags:

* For Llama-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_llama.jinja`
* For Qwen-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_qwen.jinja`
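
For illustration, raw model outputs in the four styles above might look like the following. These are hypothetical snippets constructed from the style descriptions, not actual model transcripts:

```python
# Hypothetical examples of the output styles the xlam parser detects.
direct_json = '[{"name": "get_weather", "arguments": {"location": "Tokyo"}}]'
thinking_tags = '<think>[{"name": "get_weather", "arguments": {"location": "Tokyo"}}]</think>'
code_block = '```json\n[{"name": "get_weather", "arguments": {"location": "Tokyo"}}]\n```'
tool_calls_tag = '[TOOL_CALLS][{"name": "get_weather", "arguments": {"location": "Tokyo"}}]'
```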
### Qwen Models

For Qwen2.5, the chat template in tokenizer_config.json has already included support for the Hermes-style tool use. Therefore, you can use the `hermes` parser to enable tool calls for Qwen models. For more detailed information, please refer to the official [Qwen documentation](https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm).
@@ -303,55 +282,53 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen

Here is a summary of a plugin file:
??? Code

    ```python
    # import the required packages

    # define a tool parser and register it to vllm
    # the name list in register_module can be used
    # in --tool-call-parser. you can define as many
    # tool parsers as you want here.
    @ToolParserManager.register_module(["example"])
    class ExampleToolParser(ToolParser):
        def __init__(self, tokenizer: AnyTokenizer):
            super().__init__(tokenizer)

        # adjust request. e.g.: set skip special tokens
        # to False for tool call output.
        def adjust_request(
                self, request: ChatCompletionRequest) -> ChatCompletionRequest:
            return request

        # implement the tool call parse for stream call
        def extract_tool_calls_streaming(
            self,
            previous_text: str,
            current_text: str,
            delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int],
            delta_token_ids: Sequence[int],
            request: ChatCompletionRequest,
        ) -> Union[DeltaMessage, None]:
            return delta

        # implement the tool parse for non-stream call
        def extract_tool_calls(
            self,
            model_output: str,
            request: ChatCompletionRequest,
        ) -> ExtractedToolCallInformation:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=text)
    ```
Then you can use this plugin in the command line like this.

```bash
--enable-auto-tool-choice \
--tool-parser-plugin <absolute path of the plugin file> \
--tool-call-parser example \
```